Spaces:

ahn1305
/

universalwebscraper

Running

File size: 3,512 Bytes

c54a725

import json
import os
import time
from urllib.parse import urlsplit

import gradio as gr
import requests
from firecrawl import FirecrawlApp

# Configuration
# Secrets are read from the environment (for example, secrets configured on
# the hosting platform) instead of being committed to the repository. The
# keys that used to be hard-coded here were exposed in source control and
# should be revoked and re-issued.
API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
FIRECRAWL_API_KEY = os.environ.get("FIRECRAWL_API_KEY", "")
API_URL = "https://openrouter.ai/api/v1/chat/completions"
SCRAPED_DATA_DIR = "scraped_data"
HARDCODED_DATA_FILE = "innotechtitans_data.json"
os.makedirs(SCRAPED_DATA_DIR, exist_ok=True)

# Text of the most recently loaded markdown file. Defined up front so that
# reading it before any file has been loaded does not raise NameError.
loaded_data = ""

# Function to scrape website data
def scrape_data(url):
    """Scrape *url* with Firecrawl and save the returned markdown to disk.

    The markdown is written to ``SCRAPED_DATA_DIR/<domain>.md``, where
    ``<domain>`` is a filesystem-safe name derived from the URL's host part.

    Args:
        url: Address of the page to scrape, as typed by the user.

    Returns:
        A status message for the UI: a success message naming the saved
        file, or an error message when the URL is unusable or the scrape
        did not produce markdown.
    """
    url = (url or "").strip()
    domain_name = _domain_from_url(url)
    if not domain_name:
        # Reject blank/unparseable input before spending an API call on it.
        return "❌ Error: Please enter a valid URL."

    try:
        app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)
        scraped_data = app.scrape_url(url, {'pageOptions': {'onlyMainContent': True}})
    except Exception:
        # UI boundary: any client/network failure is reported to the user as
        # a message instead of surfacing as an unhandled exception.
        return "Error: Unable to scrape data."

    if not scraped_data or 'markdown' not in scraped_data:
        return "Error: Unable to scrape data."

    file_path = os.path.join(SCRAPED_DATA_DIR, f"{domain_name}.md")
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(scraped_data['markdown'])

    return f"✅ Scraped data saved as {domain_name}.md. Load it to proceed."


def _domain_from_url(url):
    """Return a filesystem-safe name for the host part of *url*, or ""."""
    try:
        netloc = urlsplit(url).netloc
        if not netloc:
            # urlsplit only recognises a host after "//", so retry for
            # scheme-less input such as "example.com/page".
            netloc = urlsplit(f"//{url}").netloc
    except ValueError:
        return ""
    # Characters such as ":" (ports) or "@" are not safe in file names on
    # every platform, so anything outside a conservative set becomes "_".
    return "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in netloc)

# Function to load data from a markdown file
def load_data(file_name):
    """Load a previously scraped markdown file into the ``loaded_data`` global.

    Args:
        file_name: Name of a file inside ``SCRAPED_DATA_DIR``, as typed by
            the user (e.g. ``site.md``).

    Returns:
        A status message for the UI: success when the file was read, or a
        "file not found" error when the name is blank, points outside
        ``SCRAPED_DATA_DIR``, or does not refer to a regular file.
    """
    global loaded_data

    file_name = (file_name or "").strip()
    if not file_name:
        return "❌ Error: File not found."

    base_dir = os.path.realpath(SCRAPED_DATA_DIR)
    file_path = os.path.realpath(os.path.join(base_dir, file_name))

    # The name comes straight from the UI, so resolve it and refuse anything
    # that escapes the data directory (e.g. "../../secret" or an absolute
    # path); otherwise arbitrary files could be read and sent to the LLM.
    inside_base_dir = file_path.startswith(base_dir + os.sep)
    # isfile (not exists) so that a directory name is reported as missing
    # instead of making open() raise.
    if not inside_base_dir or not os.path.isfile(file_path):
        return "❌ Error: File not found."

    with open(file_path, 'r', encoding='utf-8') as file:
        loaded_data = file.read()

    return "✅ Data loaded successfully. You can now ask questions."

# Function to send a query to LLM
def ask_question(question):
    """Send *question* to the LLM together with the currently loaded data.

    The loaded markdown and the question are combined into a single user
    message and posted to ``API_URL``.

    Args:
        question: The user's question about the loaded data.

    Returns:
        The model's reply text, ``"No response."`` when the reply carries no
        content, a warning when no data has been loaded yet, or an error
        message when the request fails or returns a non-200 status.
    """
    # The global may not exist until data has been loaded; a plain name
    # lookup would raise NameError in that case, so look it up defensively.
    context = globals().get("loaded_data")
    if not context:
        return "⚠️ No data loaded. Please scrape a website or load data first."

    headers = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}
    payload = {
        "model": "deepseek/deepseek-chat:free",
        "messages": [{"role": "user", "content": f"{context}\n\n{question}"}]
    }

    try:
        # Without a timeout a stalled connection would block the UI forever.
        response = requests.post(API_URL, json=payload, headers=headers, timeout=60)
    except requests.RequestException:
        return "❌ Error: Unable to generate response."

    if response.status_code != 200:
        return "❌ Error: Unable to generate response."

    try:
        body = response.json()
    except ValueError:
        # A 200 response whose body is not valid JSON.
        return "❌ Error: Unable to generate response."

    if not isinstance(body, dict):
        return "No response."

    # "choices" may be missing or an empty list; indexing [0] directly on an
    # empty list would raise IndexError.
    choices = body.get("choices") or [{}]
    message = choices[0].get("message") or {}
    return message.get("content") or "No response."

# Gradio Interface
# Three tabs, each pairing one text input and one button with a Markdown
# component that displays whatever string the click handler returns.
with gr.Blocks(theme=gr.themes.Default()) as demo:
    gr.Markdown("""
        <h1 style='text-align: center;'>🕸️ Web Scraper & AI QnA</h1>
        <p style='text-align: center; font-size: 18px;'>Scrape websites and ask AI-powered questions!</p>
    """)

    # Tab 1: scrape a URL and save its markdown.
    with gr.Tab("Scrape Website"):
        with gr.Row():
            url_input = gr.Textbox(
                label="🌐 Website URL",
                placeholder="Enter URL to scrape",
            )
            scrape_button = gr.Button("🚀 Scrape", variant="primary")
        scrape_output = gr.Markdown()
        scrape_button.click(
            fn=scrape_data,
            inputs=[url_input],
            outputs=[scrape_output],
        )

    # Tab 2: load a saved markdown file by name.
    with gr.Tab("Load Data"):
        with gr.Row():
            file_input = gr.Textbox(
                label="📂 Markdown File Name",
                placeholder="Enter filename (e.g., site.md)",
            )
            load_button = gr.Button("📥 Load", variant="primary")
        load_output = gr.Markdown()
        load_button.click(
            fn=load_data,
            inputs=[file_input],
            outputs=[load_output],
        )

    # Tab 3: ask a question about the loaded data.
    with gr.Tab("Ask AI"):
        with gr.Row():
            question_input = gr.Textbox(
                label="❓ Ask a Question",
                placeholder="Ask based on loaded data",
            )
            ask_button = gr.Button("💬 Ask", variant="primary")
        answer_output = gr.Markdown()
        ask_button.click(
            fn=ask_question,
            inputs=[question_input],
            outputs=[answer_output],
        )

# Start the web server for the interface.
demo.launch()