Spaces:
Running
Running
Should work again with 429 handling and updated key
Browse files- README.md +6 -4
- data/nlp_papers_citation_age.txt +0 -0
- main.py +60 -48
- plots.py +9 -10
- s2.py +87 -42
README.md
CHANGED
@@ -1,10 +1,12 @@
|
|
1 |
---
|
2 |
-
title: Field
|
3 |
-
emoji:
|
4 |
colorFrom: pink
|
5 |
-
colorTo:
|
6 |
sdk: docker
|
7 |
-
|
|
|
|
|
8 |
---
|
9 |
|
10 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Citation Field x Age
|
3 |
+
emoji: π
|
4 |
colorFrom: pink
|
5 |
+
colorTo: indigo
|
6 |
sdk: docker
|
7 |
+
app_file: app.py
|
8 |
+
pinned: true
|
9 |
+
app_port: 7860
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
data/nlp_papers_citation_age.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
main.py
CHANGED
@@ -9,10 +9,15 @@ import gradio as gr
|
|
9 |
|
10 |
from aclanthology import determine_page_type
|
11 |
from plots import generate_cfdi_plot, generate_maoc_plot
|
12 |
-
from s2 import (
|
13 |
-
|
14 |
-
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
|
18 |
def return_clear():
|
@@ -21,7 +26,7 @@ def return_clear():
|
|
21 |
Returns:
|
22 |
None
|
23 |
"""
|
24 |
-
return None, None, None, None, None, None, None, None
|
25 |
|
26 |
|
27 |
def create_compute_stats(submit_type=None):
|
@@ -92,7 +97,7 @@ def plot_and_return_stats(
|
|
92 |
plot_cfdi = generate_cfdi_plot(cfdi, compute_type)
|
93 |
|
94 |
# Generate cadi plot
|
95 |
-
|
96 |
|
97 |
# Get top 3 most cited fields
|
98 |
top_fields_text = "\n".join(
|
@@ -103,72 +108,82 @@ def plot_and_return_stats(
|
|
103 |
)[:3]
|
104 |
]
|
105 |
)
|
106 |
-
|
107 |
-
cfdi = round(cfdi, 3)
|
108 |
|
109 |
# Get most common oldest papers
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
|
|
|
|
|
|
|
|
114 |
|
115 |
return (
|
116 |
title_authors,
|
117 |
num_references,
|
118 |
top_fields_text,
|
119 |
-
|
120 |
cfdi,
|
121 |
-
|
122 |
plot_cfdi,
|
123 |
-
|
124 |
)
|
125 |
|
126 |
|
127 |
-
with gr.Blocks(
|
128 |
-
theme=gr.themes.Soft()
|
129 |
-
) as demo:
|
130 |
with gr.Row():
|
131 |
gr.Markdown(
|
132 |
"""
|
133 |
-
# Citation Field Diversity Calculator
|
134 |
|
135 |
-
|
|
|
|
|
|
|
|
|
136 |
|
137 |
- By whom am I influenced? Which fields heavily inform and shape the research trajectory of my works?
|
|
|
138 |
|
139 |
-
In addition, you will be able to analyze how the above compares to the average paper or author. The results you will receive
|
140 |
|
141 |
-
- Am I reading widely across fields?
|
142 |
- Should I expand my literature search to include works from other fields?
|
|
|
143 |
|
144 |
-
Using citations as a tangible marker of influence, our demo provides empirical insights into the influence of papers across fields.
|
145 |
|
146 |
## What is Citation Field Diversity?
|
147 |
|
148 |
-
Field diversity is a measure of the variety of research
|
149 |
-
|
150 |
-
## What is the Citation Field Diversity Index (CFDI) and how is it calculated?
|
151 |
|
152 |
-
|
|
|
|
|
153 |
|
154 |
-
For more details, please refer to Eq. 3 in [this paper](https://aclanthology.org/2023.acl-long.341/).
|
155 |
"""
|
156 |
)
|
157 |
-
|
158 |
gr.Markdown(
|
159 |
"""
|
160 |
-
##
|
161 |
|
162 |
-
|
|
|
|
|
|
|
|
|
163 |
|
|
|
|
|
164 |
## How can I use this demo?
|
165 |
|
166 |
-
There are three ways
|
167 |
1. **Semantic Scholar ID**: Enter the Semantic Scholar ID of a **paper** or **author** and click the *"Compute"* button.
|
168 |
2. **ACL Anthology Link**: Paste the ACL Anthology link of a **paper**, **venue**, or **author** and click the *"Compute"* button.
|
169 |
3. **PDF File**: Upload your **paper** PDF and click the *"Compute"* button.
|
170 |
-
|
171 |
-
To retrieve the **Semantic Scholar ID** for a paper such as "The Elephant in the Room: Analyzing the Presence of Big Tech in Natural Language Processing Research," search the paper on Semantic Scholar [here](https://www.semanticscholar.org/paper/The-Elephant-in-the-Room%3A-Analyzing-the-Presence-of-Abdalla-Wahle/587ffdfd7229e8e0dbc5250b44df5fad6251f6ad) and use the last part of the URL. The Semantic Scholar ID (SSID) for this paper is: **587ffdfd7229e8e0dbc5250b44df5fad6251f6ad**.
|
172 |
|
173 |
To get an ACL Anthology link, you can go to any ACL Anthology paper, author or proceedings page and just copy and paste the url. For example:
|
174 |
- https://aclanthology.org/2023.acl-long.1/
|
@@ -197,9 +212,7 @@ with gr.Blocks(
|
|
197 |
with gr.Row():
|
198 |
acl_submit_btn = gr.Button("Compute")
|
199 |
with gr.TabItem("PDF File"):
|
200 |
-
pdf_file = gr.File(
|
201 |
-
file_types=[".pdf"], label="Upload your paper PDF"
|
202 |
-
)
|
203 |
with gr.Row():
|
204 |
file_submit_btn = gr.Button("Compute")
|
205 |
with gr.Row():
|
@@ -209,13 +222,13 @@ with gr.Blocks(
|
|
209 |
with gr.Row():
|
210 |
num_ref = gr.Textbox(label="Number of references", lines=3)
|
211 |
top_field_list = gr.Textbox(label="Top 3 fields cited:", lines=3)
|
212 |
-
|
213 |
with gr.Row():
|
214 |
cfdi = gr.Textbox(label="CFDI")
|
215 |
-
|
216 |
with gr.Row():
|
217 |
cfdi_plot = gr.Plot(label="Citation Field Diversity")
|
218 |
-
|
219 |
with gr.Row():
|
220 |
clear_btn = gr.Button("Clear")
|
221 |
|
@@ -225,11 +238,11 @@ with gr.Blocks(
|
|
225 |
title,
|
226 |
num_ref,
|
227 |
top_field_list,
|
228 |
-
|
229 |
cfdi,
|
230 |
-
|
231 |
cfdi_plot,
|
232 |
-
|
233 |
],
|
234 |
)
|
235 |
|
@@ -256,16 +269,15 @@ with gr.Blocks(
|
|
256 |
title,
|
257 |
num_ref,
|
258 |
top_field_list,
|
259 |
-
|
260 |
cfdi,
|
261 |
-
|
262 |
cfdi_plot,
|
263 |
-
|
264 |
s2_id,
|
265 |
acl_link,
|
266 |
pdf_file,
|
267 |
],
|
268 |
)
|
269 |
|
270 |
-
demo.
|
271 |
-
demo.launch(server_port=7860, server_name="0.0.0.0")
|
|
|
9 |
|
10 |
from aclanthology import determine_page_type
|
11 |
from plots import generate_cfdi_plot, generate_maoc_plot
|
12 |
+
from s2 import (
|
13 |
+
check_s2_id_type,
|
14 |
+
compute_stats_for_acl_author,
|
15 |
+
compute_stats_for_acl_paper,
|
16 |
+
compute_stats_for_acl_venue,
|
17 |
+
compute_stats_for_pdf,
|
18 |
+
compute_stats_for_s2_author,
|
19 |
+
compute_stats_for_s2_paper,
|
20 |
+
)
|
21 |
|
22 |
|
23 |
def return_clear():
|
|
|
26 |
Returns:
|
27 |
None
|
28 |
"""
|
29 |
+
return None, None, None, None, None, None, None, None, None, None, None
|
30 |
|
31 |
|
32 |
def create_compute_stats(submit_type=None):
|
|
|
97 |
plot_cfdi = generate_cfdi_plot(cfdi, compute_type)
|
98 |
|
99 |
# Generate cadi plot
|
100 |
+
plot_maoc = generate_maoc_plot(maoc, compute_type)
|
101 |
|
102 |
# Get top 3 most cited fields
|
103 |
top_fields_text = "\n".join(
|
|
|
108 |
)[:3]
|
109 |
]
|
110 |
)
|
|
|
|
|
111 |
|
112 |
# Get most common oldest papers
|
113 |
+
oldest_paper_text = "".join(
|
114 |
+
f"[{str(year)}] {title}" + "\n"
|
115 |
+
for year, title in sorted(year_title_dict.items())[:3]
|
116 |
+
)
|
117 |
+
|
118 |
+
# Round CFDI and CADI
|
119 |
+
cfdi = round(cfdi, 3)
|
120 |
+
cadi = round(cadi, 3)
|
121 |
|
122 |
return (
|
123 |
title_authors,
|
124 |
num_references,
|
125 |
top_fields_text,
|
126 |
+
oldest_paper_text,
|
127 |
cfdi,
|
128 |
+
cadi,
|
129 |
plot_cfdi,
|
130 |
+
plot_maoc,
|
131 |
)
|
132 |
|
133 |
|
134 |
+
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
|
|
|
135 |
with gr.Row():
|
136 |
gr.Markdown(
|
137 |
"""
|
138 |
+
# Citation Age and Field Diversity Calculator
|
139 |
|
140 |
+
<div align="center">
|
141 |
+
<img src="https://onedrive.live.com/embed?resid=684CB5200DB6B388%21682618&authkey=%21AILbTZikzXAbAyc&width=1310&height=728" />
|
142 |
+
</div>
|
143 |
+
|
144 |
+
Welcome to this interactive demo to analyze various aspects of your citational diversity. This tool will enable you to reflect on two critical aspects:
|
145 |
|
146 |
- By whom am I influenced? Which fields heavily inform and shape the research trajectory of my works?
|
147 |
+
- How far back in time do I cite? What are critical works (present and past) that shape my research?
|
148 |
|
149 |
+
In addition, you will be able to analyze how the above compares to the average paper or author. The results you will receive cannot be categorized into “good” or “bad”. Instead, they are meant to raise self-awareness about one’s citational diversity and reflect on it. The results might bring you to further questions, such as:
|
150 |
|
151 |
+
- Am I reading widely across fields and time?
|
152 |
- Should I expand my literature search to include works from other fields?
|
153 |
+
- Are there ideas rooted in the past that can be used in an innovative way?
|
154 |
|
155 |
+
Using citations as a tangible marker of influence, our demo provides empirical insights into the influence of papers across fields and time.
|
156 |
|
157 |
## What is Citation Field Diversity?
|
158 |
|
159 |
+
Field diversity is a measure of the variety of research fields that a paper or an author draws upon. A high field diversity indicates that the work draws from various distinct research fields, demonstrating a multidisciplinary influence on that work or author.
|
|
|
|
|
160 |
|
161 |
+
## What is Citation Age Diversity?
|
162 |
+
|
163 |
+
Citation age is a measure of how far back in time a paper cites other papers. A high citation age shows that the work draws from past works, while a low citation age indicates that mostly recent work has influenced that paper.
|
164 |
|
|
|
165 |
"""
|
166 |
)
|
|
|
167 |
gr.Markdown(
|
168 |
"""
|
169 |
+
## What are the Citation Field Diversity Index (CFDI) and Citation Age Diversity Index (CADI) and how are they calculated?
|
170 |
|
171 |
+
The calculation of Field Diversity involves extracting all the references of a paper, categorizing them into distinct study fields, and determining the proportion of each study field over all the references. The Citation Field Diversity Index (CFDI) is then computed by applying the Gini Index on these proportions.
|
172 |
+
Calculating CADI is similar to CFDI but instead of determining the proportion of each study field, we determine the proportion of citation ages. If we take a paper from 2020 that cites two papers, one from 2010 and one from 1990, the citation ages are 10 and 30, respectively. The CADI is then computed by applying the Gini Index on these ages.
|
173 |
+
For more details, please refer to Eq. 3 in [this paper](https://aclanthology.org/2023.acl-long.341/) and Eq. 4 in [this paper](https://arxiv.org/).
|
174 |
+
|
175 |
+
## How do I Interpret CFDI and CADI?
|
176 |
|
177 |
+
For both indices, higher values indicate a greater diversity of an NLP paper (in terms of how far back it cites and in the fields it cites). On the other hand, lower values signify a lower diversity, indicating that citations are more concentrated in specific fields and time ranges.
|
178 |
+
|
179 |
## How can I use this demo?
|
180 |
|
181 |
+
There are three ways to compute the field and age diversity for papers:
|
182 |
1. **Semantic Scholar ID**: Enter the Semantic Scholar ID of a **paper** or **author** and click the *"Compute"* button.
|
183 |
2. **ACL Anthology Link**: Paste the ACL Anthology link of a **paper**, **venue**, or **author** and click the *"Compute"* button.
|
184 |
3. **PDF File**: Upload your **paper** PDF and click the *"Compute"* button.
|
185 |
+
|
186 |
+
To retrieve the **Semantic Scholar ID** for a paper such as "The Elephant in the Room: Analyzing the Presence of Big Tech in Natural Language Processing Research," search the paper on Semantic Scholar [here](https://www.semanticscholar.org/paper/The-Elephant-in-the-Room%3A-Analyzing-the-Presence-of-Abdalla-Wahle/587ffdfd7229e8e0dbc5250b44df5fad6251f6ad) and use the last part of the URL. The Semantic Scholar ID (SSID) for this paper is: **587ffdfd7229e8e0dbc5250b44df5fad6251f6ad**.
|
187 |
|
188 |
To get an ACL Anthology link, you can go to any ACL Anthology paper, author or proceedings page and just copy and paste the url. For example:
|
189 |
- https://aclanthology.org/2023.acl-long.1/
|
|
|
212 |
with gr.Row():
|
213 |
acl_submit_btn = gr.Button("Compute")
|
214 |
with gr.TabItem("PDF File"):
|
215 |
+
pdf_file = gr.File(file_types=[".pdf"], label="Upload your paper PDF")
|
|
|
|
|
216 |
with gr.Row():
|
217 |
file_submit_btn = gr.Button("Compute")
|
218 |
with gr.Row():
|
|
|
222 |
with gr.Row():
|
223 |
num_ref = gr.Textbox(label="Number of references", lines=3)
|
224 |
top_field_list = gr.Textbox(label="Top 3 fields cited:", lines=3)
|
225 |
+
top_age_list = gr.Textbox(label="Top 3 oldest papers cited:", lines=3)
|
226 |
with gr.Row():
|
227 |
cfdi = gr.Textbox(label="CFDI")
|
228 |
+
cadi = gr.Textbox(label="CADI")
|
229 |
with gr.Row():
|
230 |
cfdi_plot = gr.Plot(label="Citation Field Diversity")
|
231 |
+
cadi_plot = gr.Plot(label="Citation Age Diversity")
|
232 |
with gr.Row():
|
233 |
clear_btn = gr.Button("Clear")
|
234 |
|
|
|
238 |
title,
|
239 |
num_ref,
|
240 |
top_field_list,
|
241 |
+
top_age_list,
|
242 |
cfdi,
|
243 |
+
cadi,
|
244 |
cfdi_plot,
|
245 |
+
cadi_plot,
|
246 |
],
|
247 |
)
|
248 |
|
|
|
269 |
title,
|
270 |
num_ref,
|
271 |
top_field_list,
|
272 |
+
top_age_list,
|
273 |
cfdi,
|
274 |
+
cadi,
|
275 |
cfdi_plot,
|
276 |
+
cadi_plot,
|
277 |
s2_id,
|
278 |
acl_link,
|
279 |
pdf_file,
|
280 |
],
|
281 |
)
|
282 |
|
283 |
+
demo.launch(server_port=7860, server_name="127.0.0.1")
|
|
plots.py
CHANGED
@@ -22,16 +22,15 @@ mean_cfdi = papers_df["incoming_diversity"].mean()
|
|
22 |
# Compute the mean CADI
|
23 |
mean_citation_ages = []
|
24 |
|
25 |
-
#
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
#| mean_citation_ages.append(temp)
|
35 |
|
36 |
|
37 |
def generate_cfdi_plot(input_cfdi, compute_type="paper"):
|
|
|
22 |
# Compute the mean CADI
|
23 |
mean_citation_ages = []
|
24 |
|
25 |
+
# Open the file and read the content in a list
|
26 |
+
with open(
|
27 |
+
os.path.join(dirname, "data/nlp_papers_citation_age.txt"),
|
28 |
+
"r",
|
29 |
+
encoding="utf-8",
|
30 |
+
) as filehandle:
|
31 |
+
for line in filehandle:
|
32 |
+
temp = float(line[:-1])
|
33 |
+
mean_citation_ages.append(temp)
|
|
|
34 |
|
35 |
|
36 |
def generate_cfdi_plot(input_cfdi, compute_type="paper"):
|
s2.py
CHANGED
@@ -5,6 +5,7 @@
|
|
5 |
import asyncio
|
6 |
import datetime
|
7 |
import os
|
|
|
8 |
from collections import Counter
|
9 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
10 |
from typing import List, Tuple
|
@@ -12,8 +13,12 @@ from typing import List, Tuple
|
|
12 |
import aiohttp
|
13 |
import requests
|
14 |
|
15 |
-
from aclanthology import (
|
16 |
-
|
|
|
|
|
|
|
|
|
17 |
from metrics import calculate_gini, calculate_gini_simpson
|
18 |
from pdf import parse_pdf_to_artcile_dict
|
19 |
|
@@ -34,21 +39,47 @@ def get_or_create_eventloop():
|
|
34 |
return asyncio.get_event_loop()
|
35 |
|
36 |
|
37 |
-
def send_s2_request(request_url):
|
38 |
"""
|
39 |
Sends a GET request to the specified URL with the S2 API key in the headers.
|
40 |
|
|
|
|
|
|
|
|
|
|
|
41 |
Args:
|
42 |
request_url (str): The URL to send the request to.
|
|
|
|
|
43 |
|
44 |
Returns:
|
45 |
-
requests.Response: The response object
|
46 |
"""
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
|
54 |
def check_s2_id_type(semantic_scholar_id):
|
@@ -64,9 +95,9 @@ def check_s2_id_type(semantic_scholar_id):
|
|
64 |
if the ID is not valid for either a paper or an author.
|
65 |
"""
|
66 |
# First, check if it's a paper ID
|
67 |
-
paper_response =
|
68 |
f"https://api.semanticscholar.org/v1/paper/{semantic_scholar_id}",
|
69 |
-
|
70 |
)
|
71 |
|
72 |
# If the response status code is 200, it means the ID is valid for a paper
|
@@ -74,17 +105,19 @@ def check_s2_id_type(semantic_scholar_id):
|
|
74 |
return "paper", None
|
75 |
|
76 |
# Next, check if it's an author ID
|
77 |
-
author_response =
|
78 |
f"https://api.semanticscholar.org/v1/author/{semantic_scholar_id}",
|
79 |
-
|
80 |
)
|
81 |
|
82 |
# If the response status code is 200, it means the ID is valid for an author
|
83 |
return (
|
84 |
"author",
|
85 |
-
|
86 |
-
|
87 |
-
|
|
|
|
|
88 |
)
|
89 |
|
90 |
|
@@ -167,29 +200,18 @@ def compute_stats_for_references(s2_ref_paper_keys, year):
|
|
167 |
year_ref for year_ref in reference_year_list if year_ref is not None
|
168 |
]
|
169 |
reference_title_list = [
|
170 |
-
title_ref
|
171 |
-
for title_ref in reference_title_list
|
172 |
-
if title_ref is not None
|
173 |
]
|
174 |
|
175 |
# Count references
|
176 |
num_references = len(reference_year_list)
|
177 |
|
178 |
# Flatten list and count occurrences
|
179 |
-
fields_of_study_counts = dict(
|
180 |
-
Counter(
|
181 |
-
[
|
182 |
-
field
|
183 |
-
for field in reference_fos_list
|
184 |
-
]
|
185 |
-
)
|
186 |
-
)
|
187 |
|
188 |
# Citation age list
|
189 |
aoc_list = [
|
190 |
-
year - year_ref
|
191 |
-
for year_ref in reference_year_list
|
192 |
-
if year_ref and year
|
193 |
]
|
194 |
if not aoc_list:
|
195 |
return None, None, None, None, None, None
|
@@ -253,9 +275,7 @@ def compute_stats_for_s2_paper(ssid_paper_id):
|
|
253 |
result["year"],
|
254 |
result["authors"],
|
255 |
)
|
256 |
-
title_authors = (
|
257 |
-
title + "\n" + ", ".join([author["name"] for author in authors])
|
258 |
-
)
|
259 |
|
260 |
(
|
261 |
num_references,
|
@@ -304,12 +324,14 @@ def compute_stats_for_acl_paper(url):
|
|
304 |
Returns:
|
305 |
dict: A dictionary containing statistics for the paper, or None if the paper was not found.
|
306 |
"""
|
|
|
307 |
if paper_info := extract_paper_info(url):
|
308 |
loop = get_or_create_eventloop()
|
309 |
# Match paper ID to Semantic Scholar ID
|
310 |
s2_paper = loop.run_until_complete(
|
311 |
async_match_acl_id_to_s2_paper(paper_info["acl_id"])
|
312 |
)
|
|
|
313 |
return compute_stats_for_s2_paper(s2_paper["paperId"])
|
314 |
return None
|
315 |
|
@@ -420,19 +442,44 @@ def compute_stats_for_multiple_s2_papers(
|
|
420 |
)
|
421 |
|
422 |
|
423 |
-
async def send_s2_async_request(url):
|
424 |
"""
|
425 |
-
Sends an asynchronous request to the specified URL and returns the
|
|
|
|
|
|
|
|
|
|
|
426 |
|
427 |
Args:
|
428 |
url (str): The URL to send the request to.
|
|
|
|
|
429 |
|
430 |
Returns:
|
431 |
-
dict: The response
|
432 |
"""
|
433 |
-
|
434 |
-
|
435 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
436 |
|
437 |
|
438 |
async def match_title_to_s2_paper(title, authors=None):
|
@@ -447,9 +494,7 @@ async def match_title_to_s2_paper(title, authors=None):
|
|
447 |
str or None: Returns the S2 paper ID if found, otherwise None.
|
448 |
"""
|
449 |
# Send a request to the Semantic Scholar API to search for the paper by its title
|
450 |
-
search_url =
|
451 |
-
f"http://api.semanticscholar.org/graph/v1/paper/search?query={title}"
|
452 |
-
)
|
453 |
|
454 |
# Send request
|
455 |
response = await send_s2_async_request(search_url)
|
|
|
5 |
import asyncio
|
6 |
import datetime
|
7 |
import os
|
8 |
+
import time # Added to implement retry delays
|
9 |
from collections import Counter
|
10 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
11 |
from typing import List, Tuple
|
|
|
13 |
import aiohttp
|
14 |
import requests
|
15 |
|
16 |
+
from aclanthology import (
|
17 |
+
async_match_acl_id_to_s2_paper,
|
18 |
+
extract_author_info,
|
19 |
+
extract_paper_info,
|
20 |
+
extract_venue_info,
|
21 |
+
)
|
22 |
from metrics import calculate_gini, calculate_gini_simpson
|
23 |
from pdf import parse_pdf_to_artcile_dict
|
24 |
|
|
|
39 |
return asyncio.get_event_loop()
|
40 |
|
41 |
|
42 |
+
def send_s2_request(request_url, max_retries: int = 3):
|
43 |
"""
|
44 |
Sends a GET request to the specified URL with the S2 API key in the headers.
|
45 |
|
46 |
+
If the request is rate-limited (HTTP 429), it will be retried after the
|
47 |
+
delay specified in the ``Retry-After`` header. A maximum of
|
48 |
+
``max_retries`` additional attempts will be made before the response is
|
49 |
+
returned as-is.
|
50 |
+
|
51 |
Args:
|
52 |
request_url (str): The URL to send the request to.
|
53 |
+
max_retries (int, optional): Maximum number of retries after a 429
|
54 |
+
response. Defaults to 3.
|
55 |
|
56 |
Returns:
|
57 |
+
requests.Response: The final response object.
|
58 |
"""
|
59 |
+
|
60 |
+
for attempt in range(max_retries + 1):
|
61 |
+
response = requests.get(
|
62 |
+
request_url,
|
63 |
+
headers={"x-api-key": os.environ["s2apikey"]},
|
64 |
+
timeout=10,
|
65 |
+
)
|
66 |
+
|
67 |
+
# Return early if not rate-limited or retries exhausted
|
68 |
+
if response.status_code != 429 or attempt == max_retries:
|
69 |
+
return response
|
70 |
+
|
71 |
+
print(response.status_code)
|
72 |
+
print(response.headers)
|
73 |
+
print(response.text)
|
74 |
+
# Respect the Retry-After header if present
|
75 |
+
retry_after_header = response.headers.get("Retry-After", "3")
|
76 |
+
try:
|
77 |
+
wait_seconds = int(retry_after_header)
|
78 |
+
except ValueError:
|
79 |
+
# Header could be an HTTP-date; fall back to 3 seconds
|
80 |
+
wait_seconds = 3
|
81 |
+
|
82 |
+
time.sleep(wait_seconds)
|
83 |
|
84 |
|
85 |
def check_s2_id_type(semantic_scholar_id):
|
|
|
95 |
if the ID is not valid for either a paper or an author.
|
96 |
"""
|
97 |
# First, check if it's a paper ID
|
98 |
+
paper_response = send_s2_request(
|
99 |
f"https://api.semanticscholar.org/v1/paper/{semantic_scholar_id}",
|
100 |
+
max_retries=3,
|
101 |
)
|
102 |
|
103 |
# If the response status code is 200, it means the ID is valid for a paper
|
|
|
105 |
return "paper", None
|
106 |
|
107 |
# Next, check if it's an author ID
|
108 |
+
author_response = send_s2_request(
|
109 |
f"https://api.semanticscholar.org/v1/author/{semantic_scholar_id}",
|
110 |
+
max_retries=3,
|
111 |
)
|
112 |
|
113 |
# If the response status code is 200, it means the ID is valid for an author
|
114 |
return (
|
115 |
"author",
|
116 |
+
(
|
117 |
+
author_response.json()["name"]
|
118 |
+
if author_response.status_code == 200
|
119 |
+
else "invalid"
|
120 |
+
),
|
121 |
)
|
122 |
|
123 |
|
|
|
200 |
year_ref for year_ref in reference_year_list if year_ref is not None
|
201 |
]
|
202 |
reference_title_list = [
|
203 |
+
title_ref for title_ref in reference_title_list if title_ref is not None
|
|
|
|
|
204 |
]
|
205 |
|
206 |
# Count references
|
207 |
num_references = len(reference_year_list)
|
208 |
|
209 |
# Flatten list and count occurrences
|
210 |
+
fields_of_study_counts = dict(Counter([field for field in reference_fos_list]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
# Citation age list
|
213 |
aoc_list = [
|
214 |
+
year - year_ref for year_ref in reference_year_list if year_ref and year
|
|
|
|
|
215 |
]
|
216 |
if not aoc_list:
|
217 |
return None, None, None, None, None, None
|
|
|
275 |
result["year"],
|
276 |
result["authors"],
|
277 |
)
|
278 |
+
title_authors = title + "\n" + ", ".join([author["name"] for author in authors])
|
|
|
|
|
279 |
|
280 |
(
|
281 |
num_references,
|
|
|
324 |
Returns:
|
325 |
dict: A dictionary containing statistics for the paper, or None if the paper was not found.
|
326 |
"""
|
327 |
+
print(extract_paper_info(url))
|
328 |
if paper_info := extract_paper_info(url):
|
329 |
loop = get_or_create_eventloop()
|
330 |
# Match paper ID to Semantic Scholar ID
|
331 |
s2_paper = loop.run_until_complete(
|
332 |
async_match_acl_id_to_s2_paper(paper_info["acl_id"])
|
333 |
)
|
334 |
+
print(s2_paper)
|
335 |
return compute_stats_for_s2_paper(s2_paper["paperId"])
|
336 |
return None
|
337 |
|
|
|
442 |
)
|
443 |
|
444 |
|
445 |
+
async def send_s2_async_request(url, max_retries: int = 3):
|
446 |
"""
|
447 |
+
Sends an asynchronous GET request to the specified URL and returns the
|
448 |
+
response body as JSON.
|
449 |
+
|
450 |
+
Similar to :pyfunc:`send_s2_request`, this helper transparently retries
|
451 |
+
when the Semantic Scholar API responds with HTTP 429. The delay before
|
452 |
+
retrying is taken from the ``Retry-After`` header.
|
453 |
|
454 |
Args:
|
455 |
url (str): The URL to send the request to.
|
456 |
+
max_retries (int, optional): Maximum number of retries after a 429
|
457 |
+
response. Defaults to 3.
|
458 |
|
459 |
Returns:
|
460 |
+
dict: The response parsed as JSON.
|
461 |
"""
|
462 |
+
|
463 |
+
headers = {"x-api-key": os.environ.get("s2apikey", "")}
|
464 |
+
timeout = aiohttp.ClientTimeout(total=10)
|
465 |
+
|
466 |
+
async with aiohttp.ClientSession(timeout=timeout) as session:
|
467 |
+
for attempt in range(max_retries + 1):
|
468 |
+
async with session.get(url, headers=headers) as response:
|
469 |
+
if response.status != 429 or attempt == max_retries:
|
470 |
+
return await response.json()
|
471 |
+
|
472 |
+
print(response.status)
|
473 |
+
print(response.headers)
|
474 |
+
print(response.text)
|
475 |
+
|
476 |
+
retry_after_header = response.headers.get("Retry-After", "3")
|
477 |
+
try:
|
478 |
+
wait_seconds = int(retry_after_header)
|
479 |
+
except ValueError:
|
480 |
+
wait_seconds = 3
|
481 |
+
|
482 |
+
await asyncio.sleep(wait_seconds)
|
483 |
|
484 |
|
485 |
async def match_title_to_s2_paper(title, authors=None):
|
|
|
494 |
str or None: Returns the S2 paper ID if found, otherwise None.
|
495 |
"""
|
496 |
# Send a request to the Semantic Scholar API to search for the paper by its title
|
497 |
+
search_url = f"http://api.semanticscholar.org/graph/v1/paper/search?query={title}"
|
|
|
|
|
498 |
|
499 |
# Send request
|
500 |
response = await send_s2_async_request(search_url)
|