Spaces:

ResearchMAGIC
/

the-big-scraper

Sleeping

App Files Files Community

rodrigomasini committed on Aug 15, 2024

Commit

c63697d

verified ·

1 Parent(s): abdffe5

Create alternative.py

Browse files

Files changed (1) hide show

alternative.py +196 -0

alternative.py ADDED Viewed

	@@ -0,0 +1,196 @@

+###############################################################################################################################################################
+#                                         _____  _           ___  _         ___
+#                                        |_   _|| |_   ___  | _ )(_) __ _  / __| __  _ _  __ _  _ __  ___  _ _
+#                                          | |  | ' \ / -_) | _ \| |/ _` | \__ \/ _|| '_|/ _` || '_ \/ -_)| '_|
+#                                          |_|  |_||_|\___| |___/|_|\__, | |___/\__||_|  \__,_|| .__/\___||_|
+#                                                                   |___/                      |_|
+#
+##############################################################################################################################################################
+#                          _                         ______              _         _                  _______               _         _
+#                     _   | |                       (_____ \            | |       (_)                (_______)             (_)       (_)
+#     _____  _   _  _| |_ | |__    ___    ____  _    _____) )  ___    __| |  ____  _   ____   ___     _  _  _  _____   ___  _  ____   _
+#    (____ || | | |(_   _)|  _ \  / _ \  / ___)(_)  |  __  /  / _ \  / _  | / ___)| | / _  | / _ \   | ||_|| |(____ | /___)| ||  _ \ | |
+#    / ___ || |_| |  | |_ | | | || |_| || |     _   | |  \ \ | |_| |( (_| || |    | |( (_| || |_| |  | |   | |/ ___ ||___ || || | | || |
+#    \_____||____/    \__)|_| |_| \___/ |_|    (_)  |_|   |_| \___/  \____||_|    |_| \___ | \___/   |_|   |_|\_____|(___/ |_||_| |_||_|
+#                                                                                    (_____|
+###############################################################################################################################################################
+#
+# Last updated in: 8/15/2024
+#
+###############################################################################################################################################################
+# ------------------------------------------------------------------------------
+# IMPORTS
+# ------------------------------------------------------------------------------
+import os
+import subprocess
+from typing import Tuple
+import gradio as gr
+from dotenv import load_dotenv
+from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+from langchain_community.llms import HuggingFaceEndpoint
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+from bs4 import BeautifulSoup as Soup
+from langchain_community.document_loaders import (AsyncHtmlLoader,
+                                               NewsURLLoader, PubMedLoader,
+                                               PlaywrightURLLoader,
+                                               RecursiveUrlLoader,
+                                               SeleniumURLLoader,
+                                               UnstructuredURLLoader,
+                                               WebBaseLoader)
+# ------------------------------------------------------------------------------
+# DEV ENVIRONMENT SETUP
+# ------------------------------------------------------------------------------
+# Load environment variables
+HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+# Initialize the model instances
+repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
+llm_model_instance = HuggingFaceEndpoint(
+    repo_id=repo_id, max_length=8192, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
+)
+embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
+    api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
+)
+graph_config = {
+    "llm": {"model_instance": llm_model_instance},
+    "embeddings": {"model_instance": embedder_model_instance}
+}
+# ------------------------------------------------------------------------------
+# THE BIG SCRAPER
+# ------------------------------------------------------------------------------
+def extractDataFromUrls(urls: str, loader_type: str) -> Tuple[list, list]:
+    """Extracts data from provided URLs using specified loader type.
+    Args:
+        urls (str): Comma-separated URLs to extract data from.
+        loader_type (str): Type of loader to use for data extraction.
+    Returns:
+        tuple: A tuple containing the extracted data in JSON format and as a list of Document objects.
+               Returns error messages if an exception occurs.
+    """
+    try:
+        urls = urls.split(',')
+        data = []
+        if loader_type == 'AsyncHtmlLoader':
+            loader = AsyncHtmlLoader(urls)
+        elif loader_type == 'UnstructuredURL':
+            loader = UnstructuredURLLoader(urls=urls)
+        elif loader_type == 'RecursiveURL':
+            loader = RecursiveUrlLoader(
+                url=urls[0], max_depth=2, extractor=lambda x: Soup(x, "html.parser").text
+            )
+        elif loader_type == 'SeleniumURL':
+            loader = SeleniumURLLoader(urls=urls)
+        elif loader_type == 'SeleniumURLH':
+            loader = SeleniumURLLoader(urls=urls, headless=False)
+        elif loader_type == 'PlaywrightURL':
+            loader = PlaywrightURLLoader(urls=urls)
+        elif loader_type == 'PubMed':
+            loader = PubMedLoader(urls[0])
+        elif loader_type == 'NewsURL':
+            loader = NewsURLLoader(urls)
+        elif loader_type == 'WebBaseLoader':
+            loader = WebBaseLoader(urls)
+        else:
+            return "Not Implemented. Development in Progress", "Work In Progress"
+        data = loader.load()
+        jsonData = []
+        for item in data:
+            jsonData.append(item.to_json())
+        return jsonData, data
+    except Exception as err:
+        return "An Error Occurred. Contact Developer" + str(err), "Error Occured. Boom"
+def scrapeAndSummarize(prompt: str, source: str) -> Tuple[dict, str]:
+    """Scrapes website content based on the prompt and summarizes it.
+    Args:
+        prompt (str): The prompt to guide the scraping process.
+        source (str): The URL of the website to scrape.
+    Returns:
+        tuple: A tuple containing the scraped data as a dictionary and the execution information.
+    """
+    smart_scraper_graph = SmartScraperGraph(
+        prompt=prompt,
+        source=source,
+        config=graph_config
+    )
+    result = smart_scraper_graph.run()
+    exec_info = smart_scraper_graph.get_execution_info()
+    return result, prettify_exec_info(exec_info)
+# ------------------------------------------------------------------------------
+# TABBED GRADIO UI
+# ------------------------------------------------------------------------------
+# Define choices for the dropdown menu.
+# NOTE: 'Scrapy', 'PySpider' and 'Beautiful Soup' have no matching branch in
+# extractDataFromUrls, so selecting them returns its "Not Implemented" message.
+choices = [
+    'AsyncHtmlLoader', 'UnstructuredURL', 'RecursiveURL', 'PubMed',
+    'WebBaseLoader', 'Scrapy', 'PySpider', 'Beautiful Soup',
+    'SeleniumURL', 'SeleniumURLH', 'PlaywrightURL', 'NewsURL',
+]
+# Create the Gradio interface with tabs: one tab per entry-point function.
+with gr.Blocks() as demo:
+    gr.Markdown("# Web Scraping and Summarization")
+    with gr.Tabs():
+        # Tab 1: Data Extraction (wired to extractDataFromUrls)
+        with gr.TabItem("Data Extraction"):
+            gr.Markdown("## Extract data from URLs using various loaders")
+            # Inputs: the URL list and the loader selection, side by side.
+            with gr.Row():
+                url_input = gr.Textbox(label="Enter your comma separated URLs here")
+                loader_dropdown = gr.Dropdown(choices=choices, label="Pick your Loader from here")
+            extract_button = gr.Button("Extract Data")
+            # Outputs: the two values returned by extractDataFromUrls, in order.
+            with gr.Row():
+                extracted_data_json = gr.JSON(label="Extracted Data (JSON)")
+                extracted_data_text = gr.Textbox(label="Extracted Data (Text)")
+            extract_button.click(
+                extractDataFromUrls,
+                inputs=[url_input, loader_dropdown],
+                outputs=[extracted_data_json, extracted_data_text]
+            )
+        # Tab 2: Website Scraping and Summarization (wired to scrapeAndSummarize)
+        with gr.TabItem("Website Scraping & Summarization"):
+            gr.Markdown("# Scrape websites, no-code version")
+            gr.Markdown("""Easily scrape and summarize web content using advanced AI models on the Hugging Face Hub without writing any code. Input your desired prompt and source URL to get started.
+                        This is a no-code version of the excellent lib [ScrapeGraphAI](https://github.com/VinciGit00/Scrapegraph-ai).
+                        It's a basic demo and a work in progress. Please contribute to it to make it more useful!""")
+            with gr.Row():
+                with gr.Column():
+                    # NOTE: this "Model" textbox is not listed in the click inputs
+                    # below, so editing it does not change which model is used.
+                    model_dropdown = gr.Textbox(label="Model", value="Mistral-7B-Instruct-v0.2")
+                    prompt_input = gr.Textbox(label="Prompt", value="List me all the press releases with their headlines and urls.")
+                    source_input = gr.Textbox(label="Source URL", value="https://www.whitehouse.gov/")
+                    scrape_button = gr.Button("Scrape and Summarize")
+                with gr.Column():
+                    # Outputs: the two values returned by scrapeAndSummarize, in order.
+                    result_output = gr.JSON(label="Result")
+                    exec_info_output = gr.Textbox(label="Execution Info")
+            scrape_button.click(
+                scrapeAndSummarize,
+                inputs=[prompt_input, source_input],
+                outputs=[result_output, exec_info_output]
+            )
+# Launch the Gradio interface.
+# NOTE: there is no __main__ guard, so this runs whenever the module is executed or imported.
+demo.launch()