Create app.py
app.py (ADDED)

import datetime
import os
import tempfile
import requests
import feedparser
import gradio as gr
from transformers import pipeline
import pdfkit

# Initialize the summarizer (this may download the model on first run)
summarizer = pipeline("summarization")
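# Note: no checkpoint is pinned above, so transformers falls back to its default
# summarization model (sshleifer/distilbart-cnn-12-6 at the time of writing).
# Passing an explicit model= argument would make the Space reproducible across releases.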

def summarize_papers(topic, start_date, end_date, output_format):
    """
    Fetch papers from arXiv via its direct API based on a topic and date range,
    summarize their abstracts, and return the result as either a markdown string
    or a PDF file.

    Parameters:
        topic (str): The subject/topic to search for.
        start_date (str): Start date in 'YYYY-MM-DD' format.
        end_date (str): End date in 'YYYY-MM-DD' format.
        output_format (str): "markdown" or "pdf".

    Returns:
        str: Markdown text or file path to the generated PDF.
    """
    try:
        start_dt = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        end_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    except ValueError:
        return "Invalid date format. Please use YYYY-MM-DD."

    # Build the API query URL using the direct ArXiv API.
    base_url = "http://export.arxiv.org/api/query"
    search_query = "all:" + topic
    params = {
        "search_query": search_query,
        "start": 0,
        "max_results": 50,  # Adjust as needed
        "sortBy": "submittedDate",
        "sortOrder": "descending"
    }
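    # For reference (illustrative, URL-encoding aside): with topic="deep learning" the
    # request above amounts to querying
    # http://export.arxiv.org/api/query?search_query=all:deep+learning&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending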

    # Fetch the XML feed from ArXiv (with a timeout so a slow request cannot hang the app)
    try:
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        return f"Failed to fetch results from arXiv: {e}"
    feed = feedparser.parse(response.text)

    markdown_output = f"# ArXiv Papers on '{topic}'\n\n"
    found = False

    for entry in feed.entries:
        # Parse the published date (e.g., '2022-01-15T18:00:00Z')
        entry_date = datetime.datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
        if start_dt <= entry_date <= end_dt:
            found = True
            # Summarize the abstract using the Hugging Face pipeline
            try:
                summary_text = summarizer(entry.summary, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
            except Exception as e:
                summary_text = f"Error in summarization: {e}"
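            # Note: abstracts longer than the model's input limit can make the call above
            # fail; passing truncation=True to the summarizer call should guard against that.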

            # Build markdown content for each paper
            markdown_output += f"## {entry.title}\n"
            # entry.authors is a list of dicts with a 'name' field
            authors = ", ".join([author.name for author in entry.authors])
            markdown_output += f"**Authors:** {authors}\n\n"
            markdown_output += f"**Published:** {entry_date.strftime('%Y-%m-%d')}\n\n"
            markdown_output += f"**Summary:** {summary_text}\n\n"
            markdown_output += f"**Link:** [arXiv]({entry.id})\n\n"
            markdown_output += "---\n\n"

    if not found:
        markdown_output += "\n_No papers found within the specified date range._\n"

    if output_format == "markdown":
        return markdown_output
    elif output_format == "pdf":
        # Convert markdown to a simple HTML wrapper for PDF conversion.
        html_content = "<html><body>" + markdown_output.replace("\n", "<br>") + "</body></html>"
        temp_dir = tempfile.gettempdir()
        pdf_path = os.path.join(temp_dir, "arxiv_summary.pdf")
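        # Note: a fixed filename means concurrent users overwrite each other's PDF;
        # tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) would give each request its own file.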
        try:
            # pdfkit requires the external wkhtmltopdf binary to be available on the system.
            pdfkit.from_string(html_content, pdf_path)
            return pdf_path  # Shown as a plain path string by the Textbox output below.
        except Exception as e:
            return f"PDF conversion failed: {e}"
    else:
        return "Unsupported output format."

# Build the Gradio interface.
iface = gr.Interface(
    fn=summarize_papers,
    inputs=[
        gr.Textbox(label="Topic", placeholder="e.g., deep learning"),
        gr.Textbox(label="Start Date (YYYY-MM-DD)", placeholder="2022-01-01"),
        gr.Textbox(label="End Date (YYYY-MM-DD)", placeholder="2022-12-31"),
        gr.Radio(choices=["markdown", "pdf"], label="Output Format", value="markdown")
    ],
    # gr.outputs.Textbox is the legacy API and is gone in Gradio 4; use gr.Textbox directly.
    outputs=gr.Textbox(label="Result (Markdown or PDF File Path)"),
    title="ArXiv Paper Summarizer",
    description=(
        "Enter a topic and a date range to fetch and summarize ArXiv papers. "
        "The output is returned as markdown text or as the path to a generated PDF file."
    ),
)

iface.launch()
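
A note on dependencies (a sketch based on the imports above; the exact Space configuration is an assumption): the app needs its Python packages declared in requirements.txt, and pdfkit only works when the external wkhtmltopdf binary is installed, which on Hugging Face Spaces is typically handled through a packages.txt file.

requirements.txt
    gradio
    requests
    feedparser
    transformers
    torch
    pdfkit

packages.txt
    wkhtmltopdf

For a quick check without the UI, the function can also be called directly, e.g. print(summarize_papers("deep learning", "2022-01-01", "2022-12-31", "markdown")).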