buelfhood committed on
Commit
338915e
·
verified ·
1 Parent(s): cf98a8d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -100
app.py CHANGED
@@ -1,108 +1,140 @@
1
- import datetime
2
- import os
3
- import tempfile
4
  import requests
5
- import feedparser
 
 
 
 
6
  import gradio as gr
7
- from transformers import pipeline
8
- import pdfkit
9
-
10
- # Initialize the summarizer (this may download the model on first run)
11
- summarizer = pipeline("summarization")
12
-
13
- def summarize_papers(topic, start_date, end_date, output_format):
14
- """
15
- Fetch papers from arXiv via its direct API based on a topic and date range,
16
- summarize their abstracts, and return the result as either a markdown string
17
- or a PDF file.
18
-
19
- Parameters:
20
- topic (str): The subject/topic to search for.
21
- start_date (str): Start date in 'YYYY-MM-DD' format.
22
- end_date (str): End date in 'YYYY-MM-DD' format.
23
- output_format (str): "markdown" or "pdf".
24
-
25
- Returns:
26
- str: Markdown text or file path to the generated PDF.
27
- """
28
  try:
29
- start_dt = datetime.datetime.strptime(start_date, "%Y-%m-%d")
30
- end_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d")
31
- except ValueError:
32
- return "Invalid date format. Please use YYYY-MM-DD."
33
-
34
- # Build the API query URL using the direct ArXiv API.
35
- base_url = "http://export.arxiv.org/api/query"
36
- search_query = "all:" + topic
37
  params = {
38
- "search_query": search_query,
39
  "start": 0,
40
- "max_results": 50, # Adjust as needed
41
  "sortBy": "submittedDate",
42
- "sortOrder": "descending"
43
  }
44
-
45
- # Fetch the XML feed from ArXiv
46
- response = requests.get(base_url, params=params)
47
- feed = feedparser.parse(response.text)
48
-
49
- markdown_output = f"# ArXiv Papers on '{topic}'\n\n"
50
- found = False
51
-
52
- for entry in feed.entries:
53
- # Parse the published date (e.g., '2022-01-15T18:00:00Z')
54
- entry_date = datetime.datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
55
- if start_dt <= entry_date <= end_dt:
56
- found = True
57
- # Summarize the abstract using Hugging Face pipeline
58
- try:
59
- summary_text = summarizer(entry.summary, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
60
- except Exception as e:
61
- summary_text = "Error in summarization."
62
-
63
- # Build markdown content for each paper
64
- markdown_output += f"## {entry.title}\n"
65
- # entry.authors is a list of dicts with a 'name' field
66
- authors = ", ".join([author.name for author in entry.authors])
67
- markdown_output += f"**Authors:** {authors}\n\n"
68
- markdown_output += f"**Published:** {entry_date.strftime('%Y-%m-%d')}\n\n"
69
- markdown_output += f"**Summary:** {summary_text}\n\n"
70
- markdown_output += f"**Link:** [arXiv]({entry.id})\n\n"
71
- markdown_output += "---\n\n"
72
-
73
- if not found:
74
- markdown_output += "\n_No papers found within the specified date range._\n"
75
-
76
- if output_format == "markdown":
77
- return markdown_output
78
- elif output_format == "pdf":
79
- # Convert markdown to a simple HTML wrapper for PDF conversion.
80
- html_content = "<html><body>" + markdown_output.replace("\n", "<br>") + "</body></html>"
81
- temp_dir = tempfile.gettempdir()
82
- pdf_path = os.path.join(temp_dir, "arxiv_summary.pdf")
83
- try:
84
- pdfkit.from_string(html_content, pdf_path)
85
- return pdf_path # Gradio will treat this as a downloadable file.
86
- except Exception as e:
87
- return f"PDF conversion failed: {e}"
88
  else:
89
- return "Unsupported output format."
90
-
91
- # Build the Gradio interface.
92
- iface = gr.Interface(
93
- fn=summarize_papers,
94
- inputs=[
95
- gr.Textbox(label="Topic", placeholder="e.g., deep learning"),
96
- gr.Textbox(label="Start Date (YYYY-MM-DD)", placeholder="2022-01-01"),
97
- gr.Textbox(label="End Date (YYYY-MM-DD)", placeholder="2022-12-31"),
98
- gr.Radio(choices=["markdown", "pdf"], label="Output Format", value="markdown")
99
- ],
100
- outputs=gr.outputs.Textbox(label="Result (Markdown or PDF File Path)"),
101
- title="ArXiv Paper Summarizer",
102
- description=(
103
- "Enter a topic and a date range to fetch and summarize ArXiv papers. "
104
- "The output can be returned as markdown text or as a downloadable PDF file."
105
- ),
106
- )
107
-
108
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
 
 
2
  import requests
3
+ import xml.etree.ElementTree as ET
4
+ from datetime import datetime
5
+ import pandas as pd
6
+ from fpdf import FPDF
7
+ import os
8
  import gradio as gr
9
+ from tqdm import tqdm
10
+
11
# Load the Hugging Face summarization pipeline once at module import time.
# NOTE(review): downloads facebook/bart-large-cnn on first run — presumably
# acceptable on a Space with persistent cache; confirm for cold starts.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
13
+
14
def summarise_paper(text):
    """Summarize *text* with the module-level BART pipeline.

    Returns the summary string on success; on any pipeline failure
    (e.g. input exceeding the model's token limit) returns an error
    message string instead of raising, so callers can keep going.
    """
    try:
        result = summarizer(
            text,
            max_length=150,
            min_length=50,
            do_sample=False,
        )
        return result[0]["summary_text"]
    except Exception as exc:
        # Best-effort: surface the failure as text rather than crash the app.
        return f"Error summarizing: {str(exc)}"
20
+
21
def search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date):
    """Query the arXiv Atom API for papers matching *topic* and keep those
    published within [start_date, end_date] (inclusive, 'YYYY-MM-DD').

    Returns a DataFrame with columns Title, Authors, Abstract,
    Published Date, Link — or an empty DataFrame on HTTP error or when
    no entry falls inside the date range.
    """
    url = "https://export.arxiv.org/api/query"
    # Bug fix: pass the raw quoted phrase and let `requests` percent-encode
    # the params.  The previous value f'all:%22{topic}%22' was pre-encoded,
    # so requests turned %22 into %2522 (double encoding) and the quoted
    # phrase search never matched as intended.
    params = {
        "search_query": f'all:"{topic}"',
        "start": 0,
        "max_results": 100,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }

    # timeout prevents the Gradio worker from hanging forever on a stalled
    # connection; 30s is generous for the arXiv export endpoint.
    response = requests.get(url, params=params, timeout=30)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return pd.DataFrame()

    ns = "{http://www.w3.org/2005/Atom}"  # Atom namespace, hoisted once
    root = ET.fromstring(response.content)

    start_date_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_date_dt = datetime.strptime(end_date, "%Y-%m-%d")

    data = []
    for entry in root.findall(f"{ns}entry"):
        # '2022-01-15T18:00:00Z' -> keep only the date part for comparison.
        publication_date = entry.find(f"{ns}published").text
        publication_date_dt = datetime.strptime(
            publication_date.split("T")[0], "%Y-%m-%d"
        )
        if not (start_date_dt <= publication_date_dt <= end_date_dt):
            continue

        title = entry.find(f"{ns}title").text
        authors = ", ".join(
            author.find(f"{ns}name").text
            for author in entry.findall(f"{ns}author")
        )
        abstract = entry.find(f"{ns}summary").text
        # The Atom <id> is the /abs/ page; rewrite to the direct PDF URL.
        link = entry.find(f"{ns}id").text.replace("abs", "pdf") + ".pdf"

        data.append({
            "Title": title,
            "Authors": authors,
            "Abstract": abstract,
            "Published Date": publication_date,
            "Link": link,
        })

    return pd.DataFrame(data) if data else pd.DataFrame()
62
+
63
def add_summary_column(df):
    """Return *df* with a 'Summary' column built by summarise_paper().

    Bug fix: when the arXiv search finds nothing it returns a completely
    empty ``pd.DataFrame()`` (no columns), so ``df['Abstract']`` raised
    ``KeyError``.  Guard that case and return an empty frame carrying the
    columns downstream code expects.
    """
    if df.empty or 'Abstract' not in df.columns:
        return pd.DataFrame(
            columns=['Title', 'Authors', 'Abstract', 'Published Date',
                     'Link', 'Summary']
        )
    # Show a progress bar while summarizing row by row.
    tqdm.pandas(desc="Summarizing Papers")
    df['Summary'] = df['Abstract'].progress_apply(summarise_paper)
    return df
67
+
68
class PDF(FPDF):
    """FPDF subclass that adds a fixed title header and a page-number footer."""

    def header(self):
        # Centered bold title repeated at the top of every page.
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Research Paper Summaries', border=False, ln=True, align='C')
        self.ln(10)

    def footer(self):
        # Page number, centered, 15 mm above the bottom edge.
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', align='C')
78
+
79
def generate_combined_markdown_and_pdf(df, output_dir="output", output_name="all_papers"):
    """Render every row of *df* into one Markdown file and one PDF.

    Expects columns Title, Authors, Published Date, Summary.  Writes
    <output_dir>/<output_name>.md and .pdf, creating the directory if
    needed, and returns (markdown_path, pdf_path).
    """
    os.makedirs(output_dir, exist_ok=True)
    markdown_file = os.path.join(output_dir, f"{output_name}.md")
    pdf_file = os.path.join(output_dir, f"{output_name}.pdf")

    # --- Markdown: assemble the whole document in memory, write once. ---
    parts = ["# Research Paper Summaries\n\n"]
    for _, paper in df.iterrows():
        parts.append(f"## {paper['Title']}\n\n")
        parts.append(f"**Authors**: {paper['Authors']}\n\n")
        parts.append(f"**Publication Date**: {paper['Published Date']}\n\n")
        parts.append("### Summary\n")
        parts.append(f"{paper['Summary']}\n\n")
        parts.append("---\n\n")
    with open(markdown_file, "w", encoding="utf-8") as md_file:
        md_file.write("".join(parts))

    # --- PDF: FPDF core fonts are Latin-1 only, so replace-encode text. ---
    def latin1(text):
        return text.encode('latin-1', 'replace').decode('latin-1')

    pdf = PDF()
    pdf.add_page()
    pdf.set_font('Arial', size=12)
    pdf.cell(0, 10, "Research Paper Summaries", ln=True, align='C')
    pdf.ln(10)

    for _, paper in df.iterrows():
        pdf.set_font('Arial', 'B', size=12)
        pdf.multi_cell(0, 10, latin1(f"Title: {paper['Title']}\n"))
        pdf.set_font('Arial', size=10)
        pdf.multi_cell(0, 10, latin1(f"Authors: {paper['Authors']}\n"))
        pdf.multi_cell(0, 10, latin1(f"Publication Date: {paper['Published Date']}\n"))
        pdf.set_font('Arial', size=10)
        pdf.multi_cell(0, 10, "Summary:\n", align="L")
        pdf.multi_cell(0, 10, latin1(f"{paper['Summary']}\n"))
        pdf.ln(5)
        pdf.cell(0, 5, "-" * 100, ln=True)  # visual separator between papers
        pdf.ln(5)

    pdf.output(pdf_file)
    return markdown_file, pdf_file
120
+
121
+
122
def fetch_and_summarize(topic, start_date, end_date, export_type):
    """Gradio callback: fetch arXiv papers, summarize, and export.

    Parameters mirror the UI widgets; export_type 'PDF' selects the PDF
    path, anything else the Markdown path.  Returns (display_table,
    file_path) for the Dataframe and File components.
    """
    df = search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date)
    # Bug fix: an empty search result has no columns, so the column
    # selection below raised KeyError and crashed the UI.  Return an
    # empty table and no download file instead.
    if df.empty:
        return pd.DataFrame(columns=['Title', 'Authors', 'Published Date']), None
    df_with_summary = add_summary_column(df)
    markdown_path, pdf_path = generate_combined_markdown_and_pdf(df_with_summary)
    file_path = pdf_path if export_type == 'PDF' else markdown_path
    return df_with_summary[['Title', 'Authors', 'Published Date']], file_path
128
+
129
# Build the Gradio UI: three text inputs, an export-type radio, a results
# table, a download slot, and one button wired to fetch_and_summarize.
with gr.Blocks(theme=gr.themes.Glass()) as demo:
    topic_box = gr.Textbox(label="Topic")
    start_box = gr.Textbox(label="Start Date (YYYY-MM-DD)")
    end_box = gr.Textbox(label="End Date (YYYY-MM-DD)")
    export_choice = gr.Radio(choices=['PDF', 'Markdown'], label="Export Type")
    results_table = gr.Dataframe(label="Summarized Papers")
    download_file = gr.File(label="Download Summary")
    run_button = gr.Button("Fetch and Summarize")
    run_button.click(
        fetch_and_summarize,
        inputs=[topic_box, start_box, end_box, export_choice],
        outputs=[results_table, download_file],
    )

if __name__ == "__main__":
    demo.launch(show_error=True, debug=True, share=True)