gneya-bacancy committed
Commit 03c34b1 · verified · 1 Parent(s): 70e4c96

Upload 8 files

Files changed (8)
  1. business.py +10 -0
  2. businesses_data.csv +15 -0
  3. config.py +45 -0
  4. index.py +14 -0
  5. main.py +173 -0
  6. requirements.txt +3 -0
  7. scraper.py +170 -0
  8. utils.py +19 -0
business.py ADDED
@@ -0,0 +1,10 @@
+ from pydantic import BaseModel, Field
+
+ class BusinessData(BaseModel):
+     name: str = Field(..., description="The name of the business or entity.")
+     price_rate: str = Field(..., description="The price or rate of the plan offered by the business or entity.")
+     website: str = Field(..., description="The website URL of the business or entity.")
+     benefits: str = Field(..., description="The benefits offered by the plan.")
+     inclusions: str = Field(..., description="The inclusions of the plan.")
+
+     description: str = Field(..., description="A brief description of the business or entity.")
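A minimal usage sketch (not part of the commit) showing how a scraped record is validated against this schema; the sample values are taken from businesses_data.csv below:

    from business import BusinessData

    sample = {
        "name": "Bajaj Allianz M-Care Plan",
        "price_rate": "Rs 10,000 to Rs 75,000",
        "website": "https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/",
        "benefits": "Lump sum benefit is paid on the first diagnosis of the disease.",
        "inclusions": "A waiting period of 15 days is applicable.",
        "description": "A customized plan covering 7 vector-borne diseases.",
    }
    record = BusinessData(**sample)          # raises ValidationError if a field is missing
    print(record.model_dump_json(indent=2))  # pydantic v2 API, matching requirements.txt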
businesses_data.csv ADDED
@@ -0,0 +1,15 @@
+ name,price_rate,website,benefits,inclusions,description
+ Bajaj Allianz General Insurance Company,,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,,,Bajaj Allianz health insurance offers a wide range of health insurance plans catering to the varying health needs of the public.
+ Bajaj Allianz,,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,,,Bajaj Allianz offers a variety of health insurance plans designed to provide comprehensive coverage for medical expenses.
+ Bajaj Allianz Health Insurance,N/A,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,1800-258-5881,N/A,Bajaj Allianz Health Insurance offers a range of health insurance plans to cover medical expenses and provide financial security.
+ Bajaj Allianz M-Care Plan,"Rs 10,000 to Rs 75,000",https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,Lump sum benefit is paid on the first diagnosis of the disease.,A waiting period of 15 days is applicable.,Bajaj Allianz M-Care is a customized plan designed to provide coverage against 7 vector-borne diseases.
+ Bajaj Allianz Loan Care Plan,60 times the monthly income or the loan amount,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,"Personal accident cover, including accidental death & permanent total disability.",Cost of transportation of the mortal remains is also covered.,The Bajaj Allianz Loan Care plan protects the insured from the burden of repaying a loan in case of any unfortunate incident.
+ Bajaj Allianz Global Personal Guard Plan,Up to 100 times the gross monthly income,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,Accidental death and permanent total/partial disability are covered.,"Road ambulance, air ambulance and travel expenses are covered.",The Bajaj Allianz Global Personal Guard plan offers personal accident coverage to the insured across the globe.
+ Bajaj Allianz My Health Care Plan,Rs 1 lakh to Rs 5 crore,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,"In-patient hospitalization, OPD treatments and AYUSH treatments are covered.","Maternity expenses, including surrogacy and ART complications, and baby care are covered.",The Bajaj Allianz My Health Care plan is a mix of a basic hospitalization plan and a super top-up plan that provides all-round medical protection.
+ Bajaj Allianz Personal Accident Plan,Up to 120 times the insured’s average monthly income,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,Coverage for accidental death and permanent total disability are available.,Accidental medical expenses and hospital confinement allowance are available as optional covers.,The Bajaj Allianz Personal Accident plan covers the insured against any health uncertainties arising out of an accident.
+ Bajaj Allianz Hospital Cash Daily Allowance Plan,Rs 500 to Rs 2500 per day,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,Daily allowance is paid for each day of hospitalization.,Double the cash amount is paid in case of ICU hospitalization.,The Bajaj Allianz Hospital Cash Daily Allowance plan covers the incidental expenses incurred during the hospitalization of the insured.
+ Bajaj Allianz Tax Gain Plan,Rs 1 lakh to Rs 3 lakh,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,"OPD expenses, day care procedures and hospitalization expenses are covered.",Free health check-ups are available after every 4 claim-free years.,The Bajaj Allianz Tax Gain plan provides coverage for hospitalization and outpatient expenses of an entire family on a floater basis.
+ Bajaj Allianz Global Health Care Plan,"Domestic: Rs 37.5 lakh to Rs 3.75 crore; International: USD 100,000 to USD 1,000,000",https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,Cost of in-patient treatments and pre-post hospitalization is covered within India and abroad.,Annual preventive health check-ups within India are available.,The Bajaj Allianz Global Health Care plan provides comprehensive coverage against medical expenses incurred within India and abroad.
+ Bajaj Allianz Critical Illness Plan,Rs 1 lakh to Rs 5 lakh,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,Lump sum payment is made on the diagnosis of the critical illnesses.,A survival period of 30 days is applicable.,The Bajaj Allianz Critical Illness plan offers financial support to the insured for the treatment of 10 critical illnesses.
+ Bajaj Allianz Health Ensure Plan,"Rs 50,000 to Rs 10 lakh",https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,"Coverage for in-patient treatment, AYUSH treatment and day care procedures are available.",Preventive health check-up facilities are available after every 3 continuous renewals.,The Bajaj Allianz Health Ensure plan offers comprehensive coverage to the insured for medical expenses incurred due to an illness or injury.
+ Bajaj Allianz Sankat Mochan Plan,Up to 120 times the average monthly income,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,Coverage for death and permanent total disability due to an accident are available.,Hospital confinement allowance for up to 30 days per policy year is available.,The Bajaj Allianz Sankat Mochan plan covers the insured against any accidents that may lead to a major financial burden.
config.py ADDED
@@ -0,0 +1,45 @@
+ import os
+
+ # Specify the LLM model to use. You can choose any LLM supported by LiteLLM.
+ # Example options include "gpt-4o", "claude", "deepseek-chat", etc.
+ # For a full list of supported models, refer to:
+ # https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
+ # LLM_MODEL = "groq/llama3-8b-8192"
+ # LLM_MODEL = "gemini-2.5-pro-exp-03-25"
+ LLM_MODEL = "openai/gpt-4o-mini"
+
+ # API token for authentication with the LLM provider.
+ # This is fetched from the environment variable "OPENAI_API_KEY".
+ # API_TOKEN = os.getenv("GEMINI_API_KEY")
+ API_TOKEN = os.getenv("OPENAI_API_KEY")
+
+ # Base URL of the website to scrape.
+ # The URL may contain a "{page_number}" placeholder for paginated sites.
+ # Examples (Yellow Pages):
+ # - Plumbers in Vancouver: "https://www.yellowpages.ca/search/si/{page_number}/Plumbers/Vancouver+BC"
+ # - Restaurants in Montreal: "https://www.yellowpages.ca/search/si/{page_number}/Restaurants/Montreal+QC"
+ BASE_URL = "https://gentledental.ai/"
+
+ # CSS selector to target the main HTML element containing the business information.
+ # Narrowing the selector keeps the scraper focused on relevant content instead of
+ # sending the entire HTML page to the LLM. Leave empty to use the whole page.
+ # CSS_SELECTOR = "[class^='listing_right_section']"
+ CSS_SELECTOR = ""
+
+ # Maximum number of pages to crawl. Adjust this value based on how much data you want to scrape.
+ MAX_PAGES = 3  # Example: set to 5 to scrape 5 pages.
+
+ # Instructions for the LLM on what information to extract from the scraped content.
+ # The LLM will extract the following details for each business (matching BusinessData):
+ # - Name
+ # - Price/rate
+ # - Website
+ # - Benefits
+ # - Inclusions
+ # - A one-sentence description
+ SCRAPER_INSTRUCTIONS = (
+     "Extract all business information: 'name', 'price_rate', 'website', "
+     "'benefits', 'inclusions' and a one-sentence 'description' from the following content."
+ )
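For illustration (not part of the commit): scraper.fetch_and_process_page() builds each page URL with base_url.format(page_number=...), so a templated URL expands per page, while the current BASE_URL contains no {page_number} placeholder and therefore resolves to the same page on every iteration of the crawl loop:

    yellowpages_template = "https://www.yellowpages.ca/search/si/{page_number}/Plumbers/Vancouver+BC"
    print(yellowpages_template.format(page_number=2))
    # -> https://www.yellowpages.ca/search/si/2/Plumbers/Vancouver+BC

    print("https://gentledental.ai/".format(page_number=2))  # no placeholder: unchanged
    # -> https://gentledental.ai/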
index.py ADDED
@@ -0,0 +1,14 @@
+ import streamlit as st
+ from main import main
+ import asyncio
+ # Title
+ st.title("Agent")
+
+ # Text input
+ user_input = st.text_input("Input:")
+
+ # Button
+ if st.button("Submit"):
+     with st.spinner("Processing... Please wait."):
+         result = asyncio.run(main(user_input))
+         st.write(result)
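With Streamlit installed, the front end starts with "streamlit run index.py"; the Submit button then hands the entered query to main() from main.py and renders whatever the agent returns.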
main.py ADDED
@@ -0,0 +1,173 @@
+ import asyncio
+ import os
+
+ from crawl4ai import AsyncWebCrawler
+ from dotenv import load_dotenv
+
+ from config import API_TOKEN, LLM_MODEL
+ from config import BASE_URL, CSS_SELECTOR, MAX_PAGES, SCRAPER_INSTRUCTIONS
+ from utils import save_data_to_csv
+ from scraper import (
+     get_browser_config,
+     get_llm_strategy,
+     fetch_and_process_page,
+ )
+ from business import BusinessData
+
+ from langchain.agents import AgentExecutor, create_openai_tools_agent
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+ from langchain_core.tools import tool
+ from langchain_openai import ChatOpenAI
+ from langchain_community.utilities import GoogleSerperAPIWrapper
+
+ load_dotenv()
+
+
+ @tool
+ def search(query: str):
+     """Search for links using the Google Serper API."""
+     print("Searching for links...")
+     # The Serper API key is read from the SERPER_API_KEY environment variable
+     # rather than being hard-coded in the source.
+     serp_tool = GoogleSerperAPIWrapper(serper_api_key=os.getenv("SERPER_API_KEY"))
+     data = serp_tool.results(query)
+     links = []
+
+     # Extract links from the 'organic' results
+     if 'organic' in data:
+         for result in data['organic']:
+             if 'link' in result:
+                 links.append(result['link'])
+
+     # Extract links from 'sitelinks' within organic results
+     if 'organic' in data:
+         for result in data['organic']:
+             if 'sitelinks' in result:
+                 for sitelink in result['sitelinks']:
+                     if 'link' in sitelink:
+                         links.append(sitelink['link'])
+
+     # Extract links from 'peopleAlsoAsk'
+     if 'peopleAlsoAsk' in data:
+         for item in data['peopleAlsoAsk']:
+             if 'link' in item:
+                 links.append(item['link'])
+
+     print(links)
+     return links[:5]
+
+
+ @tool
+ async def scrape(url: str):
+     """Scrape business data from a URL once the search tool has provided it."""
+     # Initialize configurations
+     browser_config = get_browser_config()
+     llm_strategy = get_llm_strategy(
+         llm_instructions=SCRAPER_INSTRUCTIONS,  # Instructions for the LLM
+         output_format=BusinessData,  # Data output format
+     )
+     session_id = "crawler_session"
+
+     # Initialize state variables
+     page_number = 1
+     all_records = []
+     seen_names = set()
+
+     # Start the web crawler context
+     # https://docs.crawl4ai.com/api/async-webcrawler/#asyncwebcrawler
+     async with AsyncWebCrawler(config=browser_config) as crawler:
+         while True:
+             # Fetch and process data from the current page
+             records, no_results_found = await fetch_and_process_page(
+                 crawler,
+                 page_number,
+                 url,
+                 CSS_SELECTOR,
+                 llm_strategy,
+                 session_id,
+                 seen_names,
+             )
+
+             if no_results_found:
+                 print("No more records found. Ending crawl.")
+                 break  # Stop crawling when the "No Results Found" message appears
+
+             if not records:
+                 print(f"No records extracted from page {page_number}.")
+                 break  # Stop if no records are extracted
+
+             # Add the records from this page to the total list
+             all_records.extend(records)
+             page_number += 1  # Move to the next page
+
+             if page_number > MAX_PAGES:
+                 break
+
+             # Pause between requests to avoid rate limits
+             await asyncio.sleep(2)  # Adjust sleep time as needed
+
+     # Save the collected records to a CSV file
+     if all_records:
+         save_data_to_csv(
+             records=all_records,
+             data_struct=BusinessData,
+             filename="businesses_data.csv",
+         )
+     else:
+         print("No records were found during the crawl.")
+
+     # Display usage statistics for the LLM strategy
+     llm_strategy.show_usage()
+     return all_records
+
+
+ async def main(user_input):
+     """Entry point: build the tool-calling agent and run it on the user's query."""
+     prompt = ChatPromptTemplate.from_messages(
+         [
+             ("system", """You are a web scraping assistant.
+ Your tasks:
+ - If the user input mentions specific sites, use the scrape tool directly on those sites.
+ - Otherwise, call the search tool first and only then call the scrape tool on the returned links.
+ - Scrape the websites and extract the business information.
+ - Include the plan name, price, inclusions, benefits and every minor detail of each plan.
+ - If required data is missing (for example the price of a health or life insurance plan), build a new search query such as "<user input> price", search for websites that might contain it, and scrape them with the scrape tool.
+ - Continue until every piece of information about a plan is covered: its name, price, inclusions, benefits and every minor detail.
+ - Focus on one plan, gather all of its details, and then move on to the next plan.
+ - Fetch the plans and all of their details.
+ - Return the information in table format only, using Markdown tables.
+ """),
+             MessagesPlaceholder("chat_history", optional=True),
+             ("human", "User_input: {input}"),
+             MessagesPlaceholder("agent_scratchpad"),
+         ]
+     )
+     tools = [scrape, search]
+
+     # model = "llama3-8b-8192"
+     # llm = ChatGroq(api_key=API_TOKEN, model=model)
+     model = "gpt-4o-mini"
+     api_key = os.getenv("OPENAI_API_KEY")
+     llm = ChatOpenAI(api_key=api_key, model=model, temperature=0.0)
+
+     agent = create_openai_tools_agent(llm, tools, prompt)
+
+     # Create an agent executor by passing in the agent and tools
+     agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
+     result = await agent_executor.ainvoke({"input": user_input})
+     return result["output"]
+
+
+ if __name__ == "__main__":
+     asyncio.run(main(input("Enter a query: ")))
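A quick way to exercise main() without the Streamlit front end (a sketch, not part of the commit; it assumes OPENAI_API_KEY, plus a SERPER_API_KEY for the search tool, are available in the environment or a .env file):

    import asyncio
    from main import main

    # Hypothetical query; any plan-related question works the same way.
    query = "Bajaj Allianz health insurance plans with price, benefits and inclusions"
    print(asyncio.run(main(query)))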
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ Crawl4AI==0.4.247
+ python-dotenv==1.0.1
+ pydantic==2.10.6
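Note that requirements.txt pins only the scraping core. index.py and main.py additionally import streamlit, langchain, langchain-openai and langchain-community, which need to be installed alongside "pip install -r requirements.txt", and Crawl4AI drives a Playwright browser, so a one-time browser setup step (for example "playwright install chromium") is typically required as well.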
scraper.py ADDED
@@ -0,0 +1,170 @@
+ import json
+ from pydantic import BaseModel
+ from typing import List, Set, Tuple
+ from crawl4ai import (
+     AsyncWebCrawler,
+     BrowserConfig,
+     CacheMode,
+     CrawlerRunConfig,
+     LLMExtractionStrategy,
+ )
+ from utils import is_duplicated
+ from config import LLM_MODEL, API_TOKEN
+
+
+ def get_browser_config() -> BrowserConfig:
+     """
+     Returns the browser configuration for the crawler.
+
+     Returns:
+         BrowserConfig: The configuration settings for the browser.
+     """
+     # https://docs.crawl4ai.com/core/browser-crawler-config/
+     return BrowserConfig(
+         browser_type="chromium",  # Type of browser to simulate
+         headless=True,  # Whether to run in headless mode (no GUI)
+         verbose=True,  # Enable verbose logging
+     )
+
+
+ def get_llm_strategy(llm_instructions: str, output_format: BaseModel) -> LLMExtractionStrategy:
+     """
+     Returns the configuration for the language model extraction strategy.
+
+     Returns:
+         LLMExtractionStrategy: The settings for how to extract data using the LLM.
+     """
+     # https://docs.crawl4ai.com/api/strategies/#llmextractionstrategy
+     return LLMExtractionStrategy(
+         provider=LLM_MODEL,  # Name of the LLM provider/model
+         api_token=API_TOKEN,  # API token for authentication
+         schema=output_format.model_json_schema(),  # JSON schema of the data model
+         extraction_type="schema",  # Type of extraction to perform
+         instruction=llm_instructions,  # Instructions for the LLM
+         input_format="markdown",  # Format of the input content
+         verbose=True,  # Enable verbose logging
+     )
+
+
+ async def check_no_results(
+     crawler: AsyncWebCrawler,
+     url: str,
+     session_id: str,
+ ) -> bool:
+     """
+     Checks if the "No Results Found" message is present on the page.
+
+     Args:
+         crawler (AsyncWebCrawler): The web crawler instance.
+         url (str): The URL to check.
+         session_id (str): The session identifier.
+
+     Returns:
+         bool: True if the "No Results Found" message is found, False otherwise.
+     """
+     # Fetch the page without any CSS selector or extraction strategy
+     result = await crawler.arun(
+         url=url,
+         config=CrawlerRunConfig(
+             cache_mode=CacheMode.BYPASS,
+             session_id=session_id,
+         ),
+     )
+
+     if result.success:
+         if "No Results Found" in result.cleaned_html:
+             return True
+     else:
+         print(
+             f"Error fetching page for 'No Results Found' check: {result.error_message}"
+         )
+
+     return False
+
+
+ async def fetch_and_process_page(
+     crawler: AsyncWebCrawler,
+     page_number: int,
+     base_url: str,
+     css_selector: str,
+     llm_strategy: LLMExtractionStrategy,
+     session_id: str,
+     seen_names: Set[str],
+ ) -> Tuple[List[dict], bool]:
+     """
+     Fetches and processes a single page of the target website.
+
+     Args:
+         crawler (AsyncWebCrawler): The web crawler instance.
+         page_number (int): The page number to fetch.
+         base_url (str): The base URL of the website.
+         css_selector (str): The CSS selector to target the content.
+         llm_strategy (LLMExtractionStrategy): The LLM extraction strategy.
+         session_id (str): The session identifier.
+         seen_names (Set[str]): Set of business names that have already been seen.
+
+     Returns:
+         Tuple[List[dict], bool]:
+             - List[dict]: A list of processed businesses from the page.
+             - bool: A flag indicating if the "No Results Found" message was encountered.
+     """
+     url = base_url.format(page_number=page_number)
+     print(f"Loading page {page_number}...")
+
+     # Check if the "No Results Found" message is present
+     no_results = await check_no_results(crawler, url, session_id)
+     if no_results:
+         return [], True  # No more results, signal to stop crawling
+
+     # Fetch page content with the extraction strategy
+     result = await crawler.arun(
+         url=url,
+         config=CrawlerRunConfig(
+             cache_mode=CacheMode.BYPASS,  # Do not use cached data
+             extraction_strategy=llm_strategy,  # Strategy for data extraction
+             css_selector=css_selector,  # Target specific content on the page
+             session_id=session_id,  # Unique session ID for the crawl
+         ),
+     )
+     print("----------------------------- Result -----------------------------")
+     print(result.extracted_content)
+
+     if not (result.success and result.extracted_content):
+         print(f"Error fetching page {page_number}: {result.error_message}")
+         return [], False
+
+     # Parse extracted content
+     extracted_data = json.loads(result.extracted_content)
+     print("-------------------------- Extracted Data --------------------------")
+     print(extracted_data)
+     if not extracted_data:
+         print(f"No businesses found on page {page_number}.")
+         return [], False
+
+     # Process businesses
+     all_businesses = []
+     for business in extracted_data:
+         # Debugging: print each business to understand its structure
+         print("Processing business:", business)
+
+         # Remove the 'error' key added by the extraction strategy when it is False
+         if business.get("error") is False:
+             business.pop("error", None)
+
+         if is_duplicated(business["name"], seen_names):
+             print(f"Duplicate business '{business['name']}' found. Skipping.")
+             continue  # Skip duplicate businesses
+
+         # Add business to the list
+         seen_names.add(business["name"])
+         all_businesses.append(business)
+
+     if not all_businesses:
+         print(f"No complete businesses found on page {page_number}.")
+         return [], False
+
+     print(f"Extracted {len(all_businesses)} businesses from page {page_number}.")
+     return all_businesses, False  # Continue crawling
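These helpers can also be driven directly, outside the LangChain agent in main.py. A standalone sketch (not part of the commit; crawl_one and the session name are illustrative, and it needs the same API token and config values as the rest of the project):

    import asyncio
    from crawl4ai import AsyncWebCrawler
    from business import BusinessData
    from config import CSS_SELECTOR, SCRAPER_INSTRUCTIONS
    from scraper import get_browser_config, get_llm_strategy, fetch_and_process_page

    async def crawl_one(url: str):
        strategy = get_llm_strategy(SCRAPER_INSTRUCTIONS, BusinessData)
        async with AsyncWebCrawler(config=get_browser_config()) as crawler:
            records, no_results = await fetch_and_process_page(
                crawler, 1, url, CSS_SELECTOR, strategy, "demo_session", set()
            )
        return records

    if __name__ == "__main__":
        print(asyncio.run(crawl_one("https://example.com/")))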
utils.py ADDED
@@ -0,0 +1,19 @@
+ import csv
+ from pydantic import BaseModel
+
+ def is_duplicated(record: str, seen_names: set) -> bool:
+     return record in seen_names
+
+ def save_data_to_csv(records: list, data_struct: BaseModel, filename: str):
+     if not records:
+         print("No records to save.")
+         return
+
+     # Use field names from the Pydantic data model
+     fieldnames = data_struct.model_fields.keys()
+
+     with open(filename, mode="w", newline="", encoding="utf-8") as file:
+         writer = csv.DictWriter(file, fieldnames=fieldnames)
+         writer.writeheader()
+         writer.writerows(records)
+     print(f"Saved {len(records)} records to '{filename}'.")