gneya-bacancy committed
Commit 03c34b1 · verified · 1 Parent(s): 70e4c96

Upload 8 files

Files changed (8)
  1. business.py +10 -0
  2. businesses_data.csv +15 -0
  3. config.py +45 -0
  4. index.py +14 -0
  5. main.py +173 -0
  6. requirements.txt +3 -0
  7. scraper.py +170 -0
  8. utils.py +19 -0
business.py ADDED
@@ -0,0 +1,10 @@
+ from pydantic import BaseModel, Field
+
+ class BusinessData(BaseModel):
+     name: str = Field(..., description="The name of the business or entity.")
+     price_rate: str = Field(..., description="The price or rate of the plan offered by the business or entity.")
+     website: str = Field(..., description="The website URL of the business or entity.")
+     benefits: str = Field(..., description="The benefits offered by the plan.")
+     inclusions: str = Field(..., description="The inclusions of the plan.")
+
+     description: str = Field(..., description="A brief description of the business or entity.")
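A minimal usage sketch (not part of the commit) showing how a scraped record is validated against this schema; the sample values are taken from businesses_data.csv below:

    from business import BusinessData

    sample = {
        "name": "Bajaj Allianz M-Care Plan",
        "price_rate": "Rs 10,000 to Rs 75,000",
        "website": "https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/",
        "benefits": "Lump sum benefit is paid on the first diagnosis of the disease.",
        "inclusions": "A waiting period of 15 days is applicable.",
        "description": "A customized plan covering 7 vector-borne diseases.",
    }
    record = BusinessData(**sample)          # raises ValidationError if a field is missing
    print(record.model_dump_json(indent=2))  # pydantic v2 API, matching requirements.txt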
businesses_data.csv ADDED
@@ -0,0 +1,15 @@
+ name,price_rate,website,benefits,inclusions,description
+ Bajaj Allianz General Insurance Company,,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,,,Bajaj Allianz health insurance offers a wide range of health insurance plans catering to the varying health needs of the public.
+ Bajaj Allianz,,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,,,Bajaj Allianz offers a variety of health insurance plans designed to provide comprehensive coverage for medical expenses.
+ Bajaj Allianz Health Insurance,N/A,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,1800-258-5881,N/A,Bajaj Allianz Health Insurance offers a range of health insurance plans to cover medical expenses and provide financial security.
+ Bajaj Allianz M-Care Plan,"Rs 10,000 to Rs 75,000",https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,Lump sum benefit is paid on the first diagnosis of the disease.,A waiting period of 15 days is applicable.,Bajaj Allianz M-Care is a customized plan designed to provide coverage against 7 vector-borne diseases.
+ Bajaj Allianz Loan Care Plan,60 times the monthly income or the loan amount,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,"Personal accident cover, including accidental death & permanent total disability.",Cost of transportation of the mortal remains is also covered.,The Bajaj Allianz Loan Care plan protects the insured from the burden of repaying a loan in case of any unfortunate incident.
+ Bajaj Allianz Global Personal Guard Plan,Up to 100 times the gross monthly income,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,Accidental death and permanent total/partial disability are covered.,"Road ambulance, air ambulance and travel expenses are covered.",The Bajaj Allianz Global Personal Guard plan offers personal accident coverage to the insured across the globe.
+ Bajaj Allianz My Health Care Plan,Rs 1 lakh to Rs 5 crore,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,"In-patient hospitalization, OPD treatments and AYUSH treatments are covered.","Maternity expenses, including surrogacy and ART complications, and baby care are covered.",The Bajaj Allianz My Health Care plan is a mix of a basic hospitalization plan and a super top-up plan that provides all-round medical protection.
+ Bajaj Allianz Personal Accident Plan,Up to 120 times the insured’s average monthly income,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,Coverage for accidental death and permanent total disability are available.,Accidental medical expenses and hospital confinement allowance are available as optional covers.,The Bajaj Allianz Personal Accident plan covers the insured against any health uncertainties arising out of an accident.
+ Bajaj Allianz Hospital Cash Daily Allowance Plan,Rs 500 to Rs 2500 per day,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,Daily allowance is paid for each day of hospitalization.,Double the cash amount is paid in case of ICU hospitalization.,The Bajaj Allianz Hospital Cash Daily Allowance plan covers the incidental expenses incurred during the hospitalization of the insured.
+ Bajaj Allianz Tax Gain Plan,Rs 1 lakh to Rs 3 lakh,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,"OPD expenses, day care procedures and hospitalization expenses are covered.",Free health check-ups are available after every 4 claim-free years.,The Bajaj Allianz Tax Gain plan provides coverage for hospitalization and outpatient expenses of an entire family on a floater basis.
+ Bajaj Allianz Global Health Care Plan,"Domestic: Rs 37.5 lakh to Rs 3.75 crore; International: USD 100,000 to USD 1,000,000",https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,Cost of in-patient treatments and pre-post hospitalization is covered within India and abroad.,Annual preventive health check-ups within India are available.,The Bajaj Allianz Global Health Care plan provides comprehensive coverage against medical expenses incurred within India and abroad.
+ Bajaj Allianz Critical Illness Plan,Rs 1 lakh to Rs 5 lakh,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,Lump sum payment is made on the diagnosis of the critical illnesses.,A survival period of 30 days is applicable.,The Bajaj Allianz Critical Illness plan offers financial support to the insured for the treatment of 10 critical illnesses.
+ Bajaj Allianz Health Ensure Plan,"Rs 50,000 to Rs 10 lakh",https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,"Coverage for in-patient treatment, AYUSH treatment and day care procedures are available.",Preventive health check-up facilities are available after every 3 continuous renewals.,The Bajaj Allianz Health Ensure plan offers comprehensive coverage to the insured for medical expenses incurred due to an illness or injury.
+ Bajaj Allianz Sankat Mochan Plan,Up to 120 times the average monthly income,https://www.policybazaar.com/insurance-companies/bajaj-allianz-health-insurance/,Coverage for death and permanent total disability due to an accident are available.,Hospital confinement allowance for up to 30 days per policy year is available.,The Bajaj Allianz Sankat Mochan plan covers the insured against any accidents that may lead to a major financial burden.
config.py ADDED
@@ -0,0 +1,45 @@
+ import os
+
+ # Specify the LLM model to use. You can choose any LLM supported by LiteLLM.
+ # Example options include "gpt-4o", "claude", "deepseek-chat", etc.
+ # For a full list of supported models, refer to:
+ # https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
+ # LLM_MODEL = "groq/llama3-8b-8192"
+ # LLM_MODEL = "gemini-2.5-pro-exp-03-25"
+ LLM_MODEL = "openai/gpt-4o-mini"
+
+ # API token for authentication with the LLM provider.
+ # This is fetched from the environment variable "OPENAI_API_KEY".
+ # API_TOKEN = os.getenv("GEMINI_API_KEY")
+ API_TOKEN = os.getenv("OPENAI_API_KEY")
+
+ # Base URL of the website to scrape.
+ # The URL may contain a "{page_number}" placeholder for paginated sites.
+ # Examples (Yellow Pages):
+ # - Plumbers in Vancouver: "https://www.yellowpages.ca/search/si/{page_number}/Plumbers/Vancouver+BC"
+ # - Restaurants in Montreal: "https://www.yellowpages.ca/search/si/{page_number}/Restaurants/Montreal+QC"
+ BASE_URL = "https://gentledental.ai/"
+
+ # CSS selector to target the main HTML element containing the business information.
+ # Narrowing the selector keeps the scraper focused on relevant content instead of
+ # sending the entire HTML page to the LLM. Leave empty to use the whole page.
+ # CSS_SELECTOR = "[class^='listing_right_section']"
+ CSS_SELECTOR = ""
+
+ # Maximum number of pages to crawl. Adjust this value based on how much data you want to scrape.
+ MAX_PAGES = 3  # Example: set to 5 to scrape 5 pages.
+
+ # Instructions for the LLM on what information to extract from the scraped content.
+ # The LLM will extract the following details for each business (matching BusinessData):
+ # - Name
+ # - Price/rate
+ # - Website
+ # - Benefits
+ # - Inclusions
+ # - A one-sentence description
+ SCRAPER_INSTRUCTIONS = (
+     "Extract all business information: 'name', 'price_rate', 'website', "
+     "'benefits', 'inclusions' and a one-sentence 'description' from the following content."
+ )
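For illustration (not part of the commit): scraper.fetch_and_process_page() builds each page URL with base_url.format(page_number=...), so a templated URL expands per page, while the current BASE_URL contains no {page_number} placeholder and therefore resolves to the same page on every iteration of the crawl loop:

    yellowpages_template = "https://www.yellowpages.ca/search/si/{page_number}/Plumbers/Vancouver+BC"
    print(yellowpages_template.format(page_number=2))
    # -> https://www.yellowpages.ca/search/si/2/Plumbers/Vancouver+BC

    print("https://gentledental.ai/".format(page_number=2))  # no placeholder: unchanged
    # -> https://gentledental.ai/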
index.py ADDED
@@ -0,0 +1,14 @@
+ import streamlit as st
+ from main import main
+ import asyncio
+ # Title
+ st.title("Agent")
+
+ # Text input
+ user_input = st.text_input("Input:")
+
+ # Button
+ if st.button("Submit"):
+     with st.spinner("Processing... Please wait."):
+         result = asyncio.run(main(user_input))
+         st.write(result)
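With Streamlit installed, the front end starts with "streamlit run index.py"; the Submit button then hands the entered query to main() from main.py and renders whatever the agent returns.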
main.py ADDED
@@ -0,0 +1,173 @@
+ import asyncio
+ import os
+
+ from crawl4ai import AsyncWebCrawler
+ from dotenv import load_dotenv
+
+ from config import API_TOKEN, LLM_MODEL
+ from config import BASE_URL, CSS_SELECTOR, MAX_PAGES, SCRAPER_INSTRUCTIONS
+ from utils import save_data_to_csv
+ from scraper import (
+     get_browser_config,
+     get_llm_strategy,
+     fetch_and_process_page,
+ )
+ from business import BusinessData
+
+ from langchain.agents import AgentExecutor, create_openai_tools_agent
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+ from langchain_core.tools import tool
+ from langchain_openai import ChatOpenAI
+ from langchain_community.utilities import GoogleSerperAPIWrapper
+
+ load_dotenv()
+
+
+ @tool
+ def search(query: str):
+     """Search for links using the Google Serper API."""
+     print("Searching for links...")
+     # The Serper API key is read from the SERPER_API_KEY environment variable
+     # rather than being hard-coded in the source.
+     serp_tool = GoogleSerperAPIWrapper(serper_api_key=os.getenv("SERPER_API_KEY"))
+     data = serp_tool.results(query)
+     links = []
+
+     # Extract links from the 'organic' results
+     if 'organic' in data:
+         for result in data['organic']:
+             if 'link' in result:
+                 links.append(result['link'])
+
+     # Extract links from 'sitelinks' within organic results
+     if 'organic' in data:
+         for result in data['organic']:
+             if 'sitelinks' in result:
+                 for sitelink in result['sitelinks']:
+                     if 'link' in sitelink:
+                         links.append(sitelink['link'])
+
+     # Extract links from 'peopleAlsoAsk'
+     if 'peopleAlsoAsk' in data:
+         for item in data['peopleAlsoAsk']:
+             if 'link' in item:
+                 links.append(item['link'])
+
+     print(links)
+     return links[:5]
+
+
+ @tool
+ async def scrape(url: str):
+     """Scrape business data from a URL once the search tool has provided it."""
+     # Initialize configurations
+     browser_config = get_browser_config()
+     llm_strategy = get_llm_strategy(
+         llm_instructions=SCRAPER_INSTRUCTIONS,  # Instructions for the LLM
+         output_format=BusinessData,  # Data output format
+     )
+     session_id = "crawler_session"
+
+     # Initialize state variables
+     page_number = 1
+     all_records = []
+     seen_names = set()
+
+     # Start the web crawler context
+     # https://docs.crawl4ai.com/api/async-webcrawler/#asyncwebcrawler
+     async with AsyncWebCrawler(config=browser_config) as crawler:
+         while True:
+             # Fetch and process data from the current page
+             records, no_results_found = await fetch_and_process_page(
+                 crawler,
+                 page_number,
+                 url,
+                 CSS_SELECTOR,
+                 llm_strategy,
+                 session_id,
+                 seen_names,
+             )
+
+             if no_results_found:
+                 print("No more records found. Ending crawl.")
+                 break  # Stop crawling when the "No Results Found" message appears
+
+             if not records:
+                 print(f"No records extracted from page {page_number}.")
+                 break  # Stop if no records are extracted
+
+             # Add the records from this page to the total list
+             all_records.extend(records)
+             page_number += 1  # Move to the next page
+
+             if page_number > MAX_PAGES:
+                 break
+
+             # Pause between requests to avoid rate limits
+             await asyncio.sleep(2)  # Adjust sleep time as needed
+
+     # Save the collected records to a CSV file
+     if all_records:
+         save_data_to_csv(
+             records=all_records,
+             data_struct=BusinessData,
+             filename="businesses_data.csv",
+         )
+     else:
+         print("No records were found during the crawl.")
+
+     # Display usage statistics for the LLM strategy
+     llm_strategy.show_usage()
+     return all_records
+
+
+ async def main(user_input):
+     """Entry point: build the tool-calling agent and run it on the user's query."""
+     prompt = ChatPromptTemplate.from_messages(
+         [
+             ("system", """You are a web scraping assistant.
+ Your tasks:
+ - If the user input mentions specific sites, use the scrape tool directly on those sites.
+ - Otherwise, call the search tool first and only then call the scrape tool on the returned links.
+ - Scrape the websites and extract the business information.
+ - Include the plan name, price, inclusions, benefits and every minor detail of each plan.
+ - If required data is missing (for example the price of a health or life insurance plan), build a new search query such as "<user input> price", search for websites that might contain it, and scrape them with the scrape tool.
+ - Continue until every piece of information about a plan is covered: its name, price, inclusions, benefits and every minor detail.
+ - Focus on one plan, gather all of its details, and then move on to the next plan.
+ - Fetch the plans and all of their details.
+ - Return the information in table format only, using Markdown tables.
+ """),
+             MessagesPlaceholder("chat_history", optional=True),
+             ("human", "User_input: {input}"),
+             MessagesPlaceholder("agent_scratchpad"),
+         ]
+     )
+     tools = [scrape, search]
+
+     # model = "llama3-8b-8192"
+     # llm = ChatGroq(api_key=API_TOKEN, model=model)
+     model = "gpt-4o-mini"
+     api_key = os.getenv("OPENAI_API_KEY")
+     llm = ChatOpenAI(api_key=api_key, model=model, temperature=0.0)
+
+     agent = create_openai_tools_agent(llm, tools, prompt)
+
+     # Create an agent executor by passing in the agent and tools
+     agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
+     result = await agent_executor.ainvoke({"input": user_input})
+     return result["output"]
+
+
+ if __name__ == "__main__":
+     asyncio.run(main(input("Enter a query: ")))
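A quick way to exercise main() without the Streamlit front end (a sketch, not part of the commit; it assumes OPENAI_API_KEY, plus a SERPER_API_KEY for the search tool, are available in the environment or a .env file):

    import asyncio
    from main import main

    # Hypothetical query; any plan-related question works the same way.
    query = "Bajaj Allianz health insurance plans with price, benefits and inclusions"
    print(asyncio.run(main(query)))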
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ Crawl4AI==0.4.247
+ python-dotenv==1.0.1
+ pydantic==2.10.6
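Note that requirements.txt pins only the scraping core. index.py and main.py additionally import streamlit, langchain, langchain-openai and langchain-community, which need to be installed alongside "pip install -r requirements.txt", and Crawl4AI drives a Playwright browser, so a one-time browser setup step (for example "playwright install chromium") is typically required as well.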
scraper.py ADDED
@@ -0,0 +1,170 @@
+ import json
+ from pydantic import BaseModel
+ from typing import List, Set, Tuple
+ from crawl4ai import (
+     AsyncWebCrawler,
+     BrowserConfig,
+     CacheMode,
+     CrawlerRunConfig,
+     LLMExtractionStrategy,
+ )
+ from utils import is_duplicated
+ from config import LLM_MODEL, API_TOKEN
+
+
+ def get_browser_config() -> BrowserConfig:
+     """
+     Returns the browser configuration for the crawler.
+
+     Returns:
+         BrowserConfig: The configuration settings for the browser.
+     """
+     # https://docs.crawl4ai.com/core/browser-crawler-config/
+     return BrowserConfig(
+         browser_type="chromium",  # Type of browser to simulate
+         headless=True,  # Whether to run in headless mode (no GUI)
+         verbose=True,  # Enable verbose logging
+     )
+
+
+ def get_llm_strategy(llm_instructions: str, output_format: BaseModel) -> LLMExtractionStrategy:
+     """
+     Returns the configuration for the language model extraction strategy.
+
+     Returns:
+         LLMExtractionStrategy: The settings for how to extract data using the LLM.
+     """
+     # https://docs.crawl4ai.com/api/strategies/#llmextractionstrategy
+     return LLMExtractionStrategy(
+         provider=LLM_MODEL,  # Name of the LLM provider/model
+         api_token=API_TOKEN,  # API token for authentication
+         schema=output_format.model_json_schema(),  # JSON schema of the data model
+         extraction_type="schema",  # Type of extraction to perform
+         instruction=llm_instructions,  # Instructions for the LLM
+         input_format="markdown",  # Format of the input content
+         verbose=True,  # Enable verbose logging
+     )
+
+
+ async def check_no_results(
+     crawler: AsyncWebCrawler,
+     url: str,
+     session_id: str,
+ ) -> bool:
+     """
+     Checks if the "No Results Found" message is present on the page.
+
+     Args:
+         crawler (AsyncWebCrawler): The web crawler instance.
+         url (str): The URL to check.
+         session_id (str): The session identifier.
+
+     Returns:
+         bool: True if the "No Results Found" message is found, False otherwise.
+     """
+     # Fetch the page without any CSS selector or extraction strategy
+     result = await crawler.arun(
+         url=url,
+         config=CrawlerRunConfig(
+             cache_mode=CacheMode.BYPASS,
+             session_id=session_id,
+         ),
+     )
+
+     if result.success:
+         if "No Results Found" in result.cleaned_html:
+             return True
+     else:
+         print(
+             f"Error fetching page for 'No Results Found' check: {result.error_message}"
+         )
+
+     return False
+
+
+ async def fetch_and_process_page(
+     crawler: AsyncWebCrawler,
+     page_number: int,
+     base_url: str,
+     css_selector: str,
+     llm_strategy: LLMExtractionStrategy,
+     session_id: str,
+     seen_names: Set[str],
+ ) -> Tuple[List[dict], bool]:
+     """
+     Fetches and processes a single page of the target website.
+
+     Args:
+         crawler (AsyncWebCrawler): The web crawler instance.
+         page_number (int): The page number to fetch.
+         base_url (str): The base URL of the website.
+         css_selector (str): The CSS selector to target the content.
+         llm_strategy (LLMExtractionStrategy): The LLM extraction strategy.
+         session_id (str): The session identifier.
+         seen_names (Set[str]): Set of business names that have already been seen.
+
+     Returns:
+         Tuple[List[dict], bool]:
+             - List[dict]: A list of processed businesses from the page.
+             - bool: A flag indicating if the "No Results Found" message was encountered.
+     """
+     url = base_url.format(page_number=page_number)
+     print(f"Loading page {page_number}...")
+
+     # Check if the "No Results Found" message is present
+     no_results = await check_no_results(crawler, url, session_id)
+     if no_results:
+         return [], True  # No more results, signal to stop crawling
+
+     # Fetch page content with the extraction strategy
+     result = await crawler.arun(
+         url=url,
+         config=CrawlerRunConfig(
+             cache_mode=CacheMode.BYPASS,  # Do not use cached data
+             extraction_strategy=llm_strategy,  # Strategy for data extraction
+             css_selector=css_selector,  # Target specific content on the page
+             session_id=session_id,  # Unique session ID for the crawl
+         ),
+     )
+     print("----------------------------- Result -----------------------------")
+     print(result.extracted_content)
+
+     if not (result.success and result.extracted_content):
+         print(f"Error fetching page {page_number}: {result.error_message}")
+         return [], False
+
+     # Parse extracted content
+     extracted_data = json.loads(result.extracted_content)
+     print("-------------------------- Extracted Data --------------------------")
+     print(extracted_data)
+     if not extracted_data:
+         print(f"No businesses found on page {page_number}.")
+         return [], False
+
+     # Process businesses
+     all_businesses = []
+     for business in extracted_data:
+         # Debugging: print each business to understand its structure
+         print("Processing business:", business)
+
+         # Remove the 'error' key added by the extraction strategy when it is False
+         if business.get("error") is False:
+             business.pop("error", None)
+
+         if is_duplicated(business["name"], seen_names):
+             print(f"Duplicate business '{business['name']}' found. Skipping.")
+             continue  # Skip duplicate businesses
+
+         # Add business to the list
+         seen_names.add(business["name"])
+         all_businesses.append(business)
+
+     if not all_businesses:
+         print(f"No complete businesses found on page {page_number}.")
+         return [], False
+
+     print(f"Extracted {len(all_businesses)} businesses from page {page_number}.")
+     return all_businesses, False  # Continue crawling
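These helpers can also be driven directly, outside the LangChain agent in main.py. A standalone sketch (not part of the commit; crawl_one and the session name are illustrative, and it needs the same API token and config values as the rest of the project):

    import asyncio
    from crawl4ai import AsyncWebCrawler
    from business import BusinessData
    from config import CSS_SELECTOR, SCRAPER_INSTRUCTIONS
    from scraper import get_browser_config, get_llm_strategy, fetch_and_process_page

    async def crawl_one(url: str):
        strategy = get_llm_strategy(SCRAPER_INSTRUCTIONS, BusinessData)
        async with AsyncWebCrawler(config=get_browser_config()) as crawler:
            records, no_results = await fetch_and_process_page(
                crawler, 1, url, CSS_SELECTOR, strategy, "demo_session", set()
            )
        return records

    if __name__ == "__main__":
        print(asyncio.run(crawl_one("https://example.com/")))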
utils.py ADDED
@@ -0,0 +1,19 @@
+ import csv
+ from pydantic import BaseModel
+
+ def is_duplicated(record: str, seen_names: set) -> bool:
+     return record in seen_names
+
+ def save_data_to_csv(records: list, data_struct: BaseModel, filename: str):
+     if not records:
+         print("No records to save.")
+         return
+
+     # Use field names from the Pydantic data model
+     fieldnames = data_struct.model_fields.keys()
+
+     with open(filename, mode="w", newline="", encoding="utf-8") as file:
+         writer = csv.DictWriter(file, fieldnames=fieldnames)
+         writer.writeheader()
+         writer.writerows(records)
+     print(f"Saved {len(records)} records to '{filename}'.")