Spaces:

Agents-MCP-Hackathon
/

deepsearch

Sleeping

App Files Files Community

suchith83 commited on Jun 9

Commit

68b80a4

1 Parent(s): 2a82dce

research app

Browse files

Files changed (11) hide show

.gitignore +3 -0
README.md +44 -0
app.py +52 -36
requirements.txt +2 -1
research_agent.py +295 -0
tools/__init__.py +7 -0
tools/fetch.py +31 -0
tools/firecrawl_scrape.py +33 -0
tools/search.py +65 -0
tools/summarize.py +42 -0
tools/tool.py +15 -0

.gitignore CHANGED Viewed

@@ -1,3 +1,6 @@
 venv/
 __pycache__/
 .env

 venv/
 __pycache__/
 .env
+tools/__pycache__
+.gradio/

README.md CHANGED Viewed

@@ -11,3 +11,47 @@ short_description: Searchs through web and returns related links
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Deep Research Assistant
+A Gradio web application that performs comprehensive research on any query using advanced AI models and web search capabilities.
+## Features
+- Interactive web interface using Gradio
+- Comprehensive research capabilities using multiple tools
+- Well-structured research reports with executive summaries, main findings, analysis, and sources
+- Support for a wide range of research topics
+## Setup
+1. Clone the repository
+2. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
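+3. Create a `.env` file in the root directory with the API keys the application reads:
+   ```
+   CEREBRAS_API_KEY=your_api_key_here
+   GOOGLE_API_KEY=your_google_api_key_here
+   GOOGLE_CSE_ID=your_custom_search_engine_id_here
+   FIRECRAWL_API_KEY=your_firecrawl_api_key_here
+   ```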
+## Running the Application
+1. Start the Gradio web interface:
+   ```bash
+   python app.py
+   ```
+2. Open your web browser and navigate to the URL shown in the terminal (typically http://localhost:7860)
+3. Enter your research query in the text box and click submit
+4. The application will generate a comprehensive research report based on your query
+## Usage Examples
+The application comes with built-in examples that you can try:
+- Latest developments in quantum computing
+- Current state of climate change and its impacts
+- Emerging trends in artificial intelligence
+## Note
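+Make sure you have a valid Cerebras API key set in your environment variables; the application uses a Cerebras-hosted model to generate the research reports. The search tool additionally needs `GOOGLE_API_KEY` and `GOOGLE_CSE_ID` (Google Custom Search), and the scraping tool needs `FIRECRAWL_API_KEY`.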

app.py CHANGED Viewed

@@ -1,47 +1,63 @@
 import gradio as gr
 import os
-import requests
 from dotenv import load_dotenv
-load_dotenv(".env")
-API_KEY = os.getenv("GOOGLE_API_KEY")
-CSE_ID = os.getenv("GOOGLE_CSE_ID")
-def search_web(query):
-    if not API_KEY or not CSE_ID:
-        return "Missing API key or Search Engine ID in .env"
-    params = {
-        "q": query,
-        "key": API_KEY,
-        "cx": CSE_ID
-    }
-    try:
-        response = requests.get("https://www.googleapis.com/customsearch/v1", params=params)
-        response.raise_for_status()
-        data = response.json()
-        results = data.get("items", [])
-        if not results:
-            return "No results found."
-        formatted = ""
-        for i, result in enumerate(results[:3], 1):
-            title = result.get("title", "No Title")
-            link = result.get("link", "No Link")
-            snippet = result.get("snippet", "No Snippet")
-            formatted += f"**Result {i}**\n[{title}]({link})\n\n{snippet}\n\n---\n"
-        return formatted
     except Exception as e:
-        return f"Error: {str(e)}"
-# Gradio UI
-gr.Interface(
-    fn=search_web,
-    inputs=gr.Textbox(label="Search Query", placeholder="e.g. IPL 2025 predictions"),
-    outputs=gr.Markdown(label="Results"),
-    title="Google Search Tool",
-    description="Uses Google Custom Search API to fetch top 3 web results"
-).launch()

 import gradio as gr
+from research_agent import research
 import os
 from dotenv import load_dotenv
+import re
+# Load environment variables
+load_dotenv()
+def format_as_markdown(raw: str) -> str:
+    """Convert the reporter's tagged output into Markdown for display.
+    Model reasoning wrapped in ``<think>`` tags is removed, the bracketed
+    section tags (``[EXECUTIVE_SUMMARY]`` etc.) become ``##`` headings, and
+    runs of three or more newlines collapse to a single blank line.
+    Args:
+        raw: Text returned by the research pipeline. ``None`` or an empty
+            string yields an empty string.
+    Returns:
+        The cleaned Markdown with leading/trailing whitespace stripped.
+    """
+    if not raw:
+        return ""
+    # 1. Remove well-formed <think>...</think> blocks and everything inside.
+    raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL)
+    # A closing tag left over means the opening tag was omitted, so everything
+    # before it is reasoning; an opening tag left over means the reasoning was
+    # cut off. Neither may leak into the rendered report.
+    raw = re.sub(r"^.*</think>", "", raw, flags=re.DOTALL)
+    raw = re.sub(r"<think>.*$", "", raw, flags=re.DOTALL)
+    # 2. Replace section tags with markdown headings.
+    replacements = {
+        "[EXECUTIVE_SUMMARY]": "## Executive Summary",
+        "[MAIN_FINDINGS]": "## Main Findings",
+        "[ANALYSIS]": "## Analysis",
+        "[CONCLUSION]": "## Conclusion",
+        "[SOURCES]": "## Sources",
+    }
+    for tag, header in replacements.items():
+        raw = raw.replace(tag, f"\n\n{header}\n\n")
+    # 3. Collapse the extra blank lines introduced by step 2.
+    return re.sub(r"\n{3,}", "\n\n", raw).strip()
+def process_query(query: str) -> str:
+    """Run the research pipeline for *query* and return a Markdown report.
+    Whitespace-only input is rejected with a prompt to enter a query; any
+    exception raised while researching or formatting is returned as text.
+    """
+    is_blank = query.strip() == ""
+    if is_blank:
+        return "Please enter a valid query."
+    try:
+        report = research(query)
+        return format_as_markdown(report)
     except Exception as e:
+        return f"Error occurred: {e}"
+# Create Gradio interface
+# Single-textbox UI: the query goes to process_query and the returned
+# Markdown report is rendered in the output pane.
+demo = gr.Interface(
+    fn=process_query,
+    inputs=gr.Textbox(
+        lines=3,
+        placeholder="Enter your research query here...",
+        label="Research Query"
+    ),
+    outputs=gr.Markdown(
+        label="Research Results"
+    ),
+    title="Deep Research Assistant",
+    description="Enter any query and get a comprehensive research report based on the latest information.",
+    examples=[
+        ["What are the latest developments in quantum computing?"],
+        ["Explain the current state of climate change and its impacts"],
+        ["What are the emerging trends in artificial intelligence?"]
+    ],
+    theme=gr.themes.Soft()
+)
+# Launch as a separate statement so `demo` stays bound to the Interface
+# itself rather than to whatever launch() returns.
+demo.launch(mcp_server=True)

requirements.txt CHANGED Viewed

@@ -5,4 +5,5 @@ markdownify
 mcp[cli]
 httpx
 gradio[mcp]
-textblob

 mcp[cli]
 httpx
 gradio[mcp]
+textblob
+firecrawl-py

research_agent.py ADDED Viewed

	@@ -0,0 +1,295 @@

+import os
+from typing import List, Dict, Any, Optional
+from openai import OpenAI
+import json
+from tools import SearchTool, FetchTool, SummarizeTool, FirecrawlScrapeTool
+from dotenv import load_dotenv
+from openai.types.chat import ChatCompletionMessage
+from openai.types.chat.chat_completion import ChatCompletion
+load_dotenv()
+def print_section(title: str, content: str):
+    """Print *title* and *content* framed by 80-character ``=`` rules."""
+    rule = "=" * 80
+    # One print call; sep="\n" reproduces the line-per-print layout.
+    print(f"\n{rule}", f"{title}", rule, content, f"{rule}\n", sep="\n")
+class PromptRefiner:
+    """Rewrites an informal user query as a structured research prompt."""
+    def __init__(self, client):
+        # `client` must expose the OpenAI-style chat.completions.create API.
+        self.client = client
+        self.model = "qwen-3-32b"
+    def refine(self, query: str) -> str:
+        """Send *query* to the model and return its structured research prompt."""
+        system_prompt = """You are a "Prompt Architect" for a Deep Research Tool. Your job is to take an informal user query and turn it into a clear, comprehensive, and structured research prompt.
+Your output MUST follow this exact format:
+[RESEARCH_OBJECTIVE]
+A clear, single-sentence statement of what needs to be researched.
+[CONTEXT]
+- Domain/field of research
+- Required background knowledge
+- Any specific constraints or boundaries
+[KEY_QUESTIONS]
+1. First specific question to answer
+2. Second specific question to answer
+3. Third specific question to answer
+(Add more if needed)
+[OUTPUT_REQUIREMENTS]
+- Format (e.g., structured report, bullet points)
+- Depth of analysis
+- Required citations or sources
+- Length constraints
+[KEY_TERMS]
+- Term 1
+- Term 2
+- Term 3
+(Add more if needed)
+[CLARIFICATIONS_NEEDED]
+- Any questions that need to be asked to the user
+- Any assumptions made
+"""
+        chat_messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": query},
+        ]
+        completion = self.client.chat.completions.create(
+            model=self.model,
+            messages=chat_messages,
+        )
+        return completion.choices[0].message.content
+class ResearcherAgent:
+    """Tool-calling agent that searches, scrapes and summarizes for one query.
+    The model is called in a loop; every tool call it requests is executed
+    and answered with a ``tool`` message until it replies with plain text.
+    """
+    # Upper bound on model/tool round-trips so a model that keeps requesting
+    # tools cannot keep the request running forever.
+    MAX_TOOL_ROUNDS = 15
+    def __init__(self, client):
+        self.client = client
+        self.model = "qwen-3-32b"
+        self.tools = [
+            SearchTool(),
+            # FetchTool(),
+            SummarizeTool(),
+            FirecrawlScrapeTool()
+        ]
+        # Function-calling schema advertised to the model.
+        self.tools_json = [
+            {
+                "type": "function",
+                "function": tool.to_json()
+            }
+            for tool in self.tools
+        ]
+        self.tools_map = {tool.name: tool for tool in self.tools}
+    def _run_tool_call(self, tool_call) -> dict:
+        """Execute one requested tool call and wrap the outcome as a ``tool`` message.
+        Unknown tool names and unusable arguments are reported back to the
+        model as error text instead of being skipped or raised, because every
+        ``tool_call_id`` in the assistant turn needs a matching reply.
+        """
+        tool_name = tool_call.function.name
+        tool = self.tools_map.get(tool_name)
+        if tool is None:
+            result = f"Error: unknown tool '{tool_name}'"
+        else:
+            try:
+                arguments = json.loads(tool_call.function.arguments or "{}")
+                result = tool(**arguments)
+            except (json.JSONDecodeError, TypeError) as e:
+                # Malformed JSON, a non-object payload, or unexpected keywords.
+                result = f"Error: invalid arguments for tool '{tool_name}': {e}"
+        return {
+            "tool_call_id": tool_call.id,
+            "role": "tool",
+            "name": tool_name,
+            "content": result if isinstance(result, str) else str(result)
+        }
+    def research(self, query: str) -> str:
+        """Perform web research on the given query and return summarized findings.
+        Args:
+            query: The research question to investigate.
+        Returns:
+            The model's final text answer, ``"No findings generated"`` if that
+            answer is empty, or a notice if ``MAX_TOOL_ROUNDS`` is exhausted.
+        """
+        conversation_history = [
+            {"role": "system", "content": """You are a research agent that searches the web, reads contents of the urls, and summarizes findings.
+Use below tools if you think you are not up to date with the latest information:
+- search tool - to find relevant URLs
+- firecrawl_scrape tool - to get content from the most promising URLs in markdown format
+- summarize tool - to extract key information
+Organize findings in a clear, structured format
+Your final response should be a well-organized summary of all findings, with clear sections and bullet points where appropriate."""},
+            {"role": "user", "content": query}
+        ]
+        for _ in range(self.MAX_TOOL_ROUNDS):
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=conversation_history,
+                tools=self.tools_json,
+            )
+            message = response.choices[0].message
+            if not message.tool_calls:
+                # Plain-text reply: this is the final answer.
+                return message.content or "No findings generated"
+            conversation_history.append({
+                "role": "assistant",
+                "content": message.content if message.content else "",
+                "tool_calls": message.tool_calls
+            })
+            conversation_history.extend(
+                self._run_tool_call(tool_call) for tool_call in message.tool_calls
+            )
+        return f"Research stopped after {self.MAX_TOOL_ROUNDS} tool rounds without a final answer."
+class PlannerAgent:
+    """Decides what to research next and accumulates findings in a scratchpad."""
+    # Upper bound on planner turns so an off-format or never-satisfied model
+    # cannot loop forever.
+    MAX_ROUNDS = 10
+    def __init__(self, client):
+        self.client = client
+        self.model = "qwen-3-32b"
+        self.scratchpad = ""
+        self.researcher = ResearcherAgent(client)
+    def plan(self, refined_query: str) -> str:
+        """Plan the research process and manage the scratchpad.
+        The planner model is asked repeatedly whether more research is
+        needed. Each ``NEED_RESEARCH`` reply triggers one researcher run whose
+        findings are appended to the scratchpad; ``ENOUGH_INFORMATION`` ends
+        the loop, as does reaching ``MAX_ROUNDS``.
+        Args:
+            refined_query: The structured research prompt to satisfy.
+        Returns:
+            The accumulated scratchpad text (possibly empty).
+        """
+        import re
+        from datetime import date
+        # The date is computed per call so the planner is not told a stale one.
+        system_prompt = f"""
+You are a research planner that manages the research process.
+Your responses MUST follow this exact format:
+If you need more research:
+NEED_RESEARCH
+RESEARCH_QUERY: [specific query to research]
+REASON: [why this research is needed]
+If you have enough information:
+ENOUGH_INFORMATION
+SUMMARY: [brief summary of what we've learned]
+NEXT_STEPS: [what should be done with this information]
+Always evaluate:
+1. Have we answered all key questions from the research objective?
+2. Do we have enough depth and breadth of information?
+3. Are there any gaps in our understanding?
+4. Do we need to verify any information?
+Current date is {date.today().isoformat()}.
+"""
+        conversation_history = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": f"Query: {refined_query}\nCurrent scratchpad:\n{self.scratchpad}"}
+        ]
+        format_reminder = (
+            "Your reply did not follow the required format. Respond with either "
+            "NEED_RESEARCH (including a RESEARCH_QUERY line) or ENOUGH_INFORMATION."
+        )
+        for _ in range(self.MAX_ROUNDS):
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=conversation_history
+            )
+            raw_reply = response.choices[0].message.content or ""
+            # Drop model reasoning before parsing: it can mention either marker
+            # while deliberating, which would be mistaken for the decision.
+            decision = re.sub(r"<think>.*?</think>", "", raw_reply, flags=re.DOTALL)
+            decision = re.sub(r"<think>.*$", "", decision, flags=re.DOTALL)
+            conversation_history.append({"role": "assistant", "content": decision})
+            if "ENOUGH_INFORMATION" in decision:
+                return self.scratchpad
+            query_match = re.search(r"RESEARCH_QUERY:[ \t]*(.*)", decision)
+            research_query = query_match.group(1).strip() if query_match else ""
+            if "NEED_RESEARCH" in decision and research_query:
+                findings = self.researcher.research(research_query)
+                self.scratchpad += f"\n\nNew findings:\n{findings}"
+                next_turn = f"Updated scratchpad:\n{self.scratchpad}"
+            else:
+                # Neither marker, or no usable query: ask again instead of
+                # resending the same conversation unchanged.
+                next_turn = format_reminder
+            conversation_history.append({"role": "user", "content": next_turn})
+        return self.scratchpad
+class ReporterAgent:
+    """Writes the final tagged report from the collected research findings."""
+    def __init__(self, client):
+        # `client` must expose the OpenAI-style chat.completions.create API.
+        self.client = client
+        self.model = "qwen-3-32b"
+    def generate_report(self, scratchpad: str, original_query: str) -> str:
+        """Ask the model for a sectioned report answering *original_query* from *scratchpad*."""
+        system_prompt = """You are a research reporter that generates clear, well-structured reports.
+Your report MUST follow this format:
+[EXECUTIVE_SUMMARY]
+A concise overview of the key findings and conclusions.
+[MAIN_FINDINGS]
+1. First major finding
+   - Supporting details
+   - Sources/references
+2. Second major finding
+   - Supporting details
+   - Sources/references
+(Add more as needed)
+[ANALYSIS]
+- Interpretation of the findings
+- Connections between different pieces of information
+- Implications or significance
+[CONCLUSION]
+- Summary of key takeaways
+- Any remaining questions or areas for further research
+[SOURCES]
+- List of all sources used in the research"""
+        user_prompt = f"Original query: {original_query}\n\nResearch findings:\n{scratchpad}\n\nGenerate a comprehensive report that answers the original query."
+        chat_messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ]
+        completion = self.client.chat.completions.create(
+            model=self.model,
+            messages=chat_messages,
+        )
+        return completion.choices[0].message.content
+def research(query: str) -> str:
+    """Run refine -> plan/research -> report for *query* and return the report text.
+    A missing CEREBRAS_API_KEY and any exception raised along the way are
+    returned as error strings rather than raised.
+    """
+    try:
+        api_key = os.environ.get("CEREBRAS_API_KEY")
+        if not api_key:
+            return "Error: Please set CEREBRAS_API_KEY environment variable"
+        client = OpenAI(base_url="https://api.cerebras.ai/v1", api_key=api_key)
+        # Step 1: turn the raw query into a structured research prompt.
+        refined_query = PromptRefiner(client).refine(query)
+        # Step 2: let the planner drive research and collect the findings.
+        scratchpad = PlannerAgent(client).plan(refined_query)
+        # Step 3: write the report against the user's original wording.
+        return ReporterAgent(client).generate_report(scratchpad, query)
+    except Exception as e:
+        return f"Error in research process: {e}"
+# if __name__ == "__main__":
+#     while True:
+#         query = input("Enter your query: ")
+#         if query == "exit":
+#             break
+#         print(research(query))

tools/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from .search import SearchTool
+from .fetch import FetchTool
+from .summarize import SummarizeTool
+from .firecrawl_scrape import FirecrawlScrapeTool
+from .tool import Tool
+__all__ = ["SearchTool", "FetchTool", "SummarizeTool", "Tool", "FirecrawlScrapeTool"]

tools/fetch.py ADDED Viewed

	@@ -0,0 +1,31 @@

+from .tool import Tool
+from markdownify import markdownify
+import requests
+class FetchTool(Tool):
+    """Tool that downloads a URL and returns its body converted to Markdown."""
+    # Seconds to wait for the remote server; requests has no default timeout,
+    # so without this a stalled host would block the caller indefinitely.
+    TIMEOUT_SECONDS = 15
+    def __init__(self):
+        super().__init__(
+            name="fetch",
+            description="Fetch the content of a URL and return the markdownified version of the content",
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "url": {"type": "string", "description": "The URL to fetch"}
+                },
+                "required": ["url"]
+            }
+        )
+    def __call__(self, url: str):
+        """Fetch *url* and return its content as Markdown, or an error string.
+        Failures are returned as text rather than raised so the result can be
+        handed straight back to the calling model.
+        """
+        try:
+            if not url:
+                return "Error: URL parameter is required"
+            resp = requests.get(url, timeout=self.TIMEOUT_SECONDS)
+            resp.raise_for_status()  # Raise an exception for bad status codes
+            return markdownify(resp.text)
+        except requests.exceptions.RequestException as e:
+            return f"Error fetching URL: {str(e)}"
+        except Exception as e:
+            return f"Unexpected error while processing URL: {str(e)}"

tools/firecrawl_scrape.py ADDED Viewed

	@@ -0,0 +1,33 @@

+from .tool import Tool
+from firecrawl import FirecrawlApp
+from dotenv import load_dotenv
+import os
+load_dotenv()
+class FirecrawlScrapeTool(Tool):
+    """Tool that scrapes a page through Firecrawl and returns its Markdown."""
+    def __init__(self):
+        super().__init__(
+            name="firecrawl_scrape",
+            description="Scrape a website and return the markdownified version of the content",
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "url": {"type": "string", "description": "The URL to scrape"}
+                },
+                "required": ["url"]
+            }
+        )
+    @staticmethod
+    def _extract_markdown(scrape_result):
+        """Return the markdown text from a scrape result, or ``None``.
+        Accepts a response object exposing ``.markdown``, a flat dict with a
+        ``"markdown"`` key, or a dict nesting that key under ``"data"``.
+        NOTE(review): the SDK's return shape appears to differ between
+        releases - confirm against the pinned firecrawl-py version.
+        """
+        if isinstance(scrape_result, dict):
+            payload = scrape_result.get("data", scrape_result)
+            return payload.get("markdown") if isinstance(payload, dict) else None
+        return getattr(scrape_result, "markdown", None)
+    def __call__(self, url: str):
+        """Scrape *url* and return its Markdown content, or an error string."""
+        try:
+            if not url:
+                return "Error: URL parameter is required"
+            app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
+            # Only markdown is requested because only markdown is returned.
+            scrape_result = app.scrape_url(url, formats=['markdown'])
+            markdown = self._extract_markdown(scrape_result)
+            if not markdown:
+                return "Error scraping URL: no markdown content returned"
+            return markdown
+        except Exception as e:
+            return f"Error scraping URL: {str(e)}"

tools/search.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import requests
+from dotenv import load_dotenv
+import os
+from .tool import Tool
+load_dotenv("./.env")
+class SearchTool(Tool):
+    """Web search tool backed by the Google Custom Search JSON API."""
+    # Number of hits requested from the API and passed on to the model.
+    MAX_RESULTS = 3
+    # Seconds to wait for the API; requests has no default timeout, so without
+    # this a stalled connection would block the research loop indefinitely.
+    TIMEOUT_SECONDS = 15
+    def __init__(self):
+        """Register the tool schema and read the Google credentials.
+        Raises:
+            ValueError: If GOOGLE_API_KEY or GOOGLE_CSE_ID is not set.
+        """
+        super().__init__(
+            name="search",
+            description="Search the web for information",
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "query": {"type": "string", "description": "The search query"}
+                },
+                "required": ["query"]
+            }
+        )
+        self.api_key = os.environ.get("GOOGLE_API_KEY")
+        self.search_engine_id = os.environ.get("GOOGLE_CSE_ID")
+        if not self.api_key:
+            raise ValueError("Please set GOOGLE_API_KEY environment variable")
+        if not self.search_engine_id:
+            raise ValueError("Please set GOOGLE_CSE_ID environment variable")
+    def __call__(self, query: str):
+        """Search for *query* and return the top hits as formatted text.
+        Returns:
+            One "Result N" entry (title, link, snippet) per hit, a
+            no-results notice, or an error string; failures are returned as
+            text rather than raised.
+        """
+        try:
+            if not query:
+                return "Error: Query parameter is required"
+            params = {
+                "q": query,
+                "key": self.api_key,
+                "cx": self.search_engine_id,
+                "num": self.MAX_RESULTS
+            }
+            resp = requests.get(
+                "https://www.googleapis.com/customsearch/v1",
+                params=params,
+                timeout=self.TIMEOUT_SECONDS
+            )
+            resp.raise_for_status()  # Raise an exception for bad status codes
+            items = resp.json().get("items", [])[:self.MAX_RESULTS]
+            if not items:
+                return "No results found for the given query."
+            formatted_results = [
+                f"Result {i}:\nTitle: {item.get('title', 'No title')}\nLink: {item.get('link', 'No link')}\nSnippet: {item.get('snippet', 'No snippet')}\n"
+                for i, item in enumerate(items, 1)
+            ]
+            return "\n".join(formatted_results)
+        except requests.exceptions.RequestException as e:
+            return f"Error during search: {str(e)}"
+        except Exception as e:
+            return f"Unexpected error during search: {str(e)}"

tools/summarize.py ADDED Viewed

	@@ -0,0 +1,42 @@

+from .tool import Tool
+from openai import OpenAI
+from dotenv import load_dotenv
+import os
+load_dotenv("./.env")
+class SummarizeTool(Tool):
+    """Tool that condenses a block of text with the Cerebras-hosted model."""
+    def __init__(self):
+        """Register the tool schema and create the API client.
+        Raises:
+            ValueError: If CEREBRAS_API_KEY is not set.
+        """
+        super().__init__(
+            name="summarize",
+            # The tool receives text, not a URL; the description says so to
+            # keep the model from passing a bare link as `content`.
+            description="Summarize the provided text content (pass the text itself, not a URL)",
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "content": {"type": "string", "description": "The content to summarize"}
+                },
+                "required": ["content"]
+            }
+        )
+        api_key = os.environ.get("CEREBRAS_API_KEY")
+        if not api_key:
+            raise ValueError("Please set CEREBRAS_API_KEY environment variable")
+        self.client = OpenAI(base_url="https://api.cerebras.ai/v1", api_key=api_key)
+    def __call__(self, **kwargs):
+        """Summarize ``kwargs["content"]`` and return the summary or an error string."""
+        import re
+        try:
+            content = kwargs.get("content")
+            if not content:
+                return "Error: Content parameter is required"
+            response = self.client.chat.completions.create(
+                model="qwen-3-32b",
+                messages=[
+                    {"role": "system", "content": "You are a helpful assistant that summarizes content while keeping the all important information."},
+                    {"role": "user", "content": content}
+                ]
+            )
+            summary = response.choices[0].message.content or ""
+            # Return only the summary: <think> reasoning would otherwise be fed
+            # back to the caller as if it were part of the result.
+            return re.sub(r"<think>.*?</think>", "", summary, flags=re.DOTALL).strip()
+        except Exception as e:
+            return f"Error during summarization: {str(e)}"

tools/tool.py ADDED Viewed

	@@ -0,0 +1,15 @@

+class Tool:
+    """Base record for a callable tool: a name, a description and an input schema."""
+    def __init__(self, name: str, description: str, inputSchema: dict):
+        self.name, self.description, self.inputSchema = name, description, inputSchema
+    def __repr__(self):
+        return "Tool(name={}, description={}, inputSchema={})".format(
+            self.name, self.description, self.inputSchema
+        )
+    def to_json(self):
+        """Return the tool as a dict with ``name``, ``description`` and ``parameters`` keys."""
+        return dict(
+            name=self.name,
+            description=self.description,
+            parameters=self.inputSchema,
+        )