kushagrasharma-13 committed
Commit 5f2db56 · 1 Parent(s): 9fa8d4f

Better Scraping

Files changed (1)
  1. app.py +10 -10
app.py CHANGED
@@ -36,7 +36,7 @@ HEADERS = {
     "Referer": "https://www.google.com/"
 }
 
-# ✅ **Extract Links**
+# **Extract Links**
 def get_valid_links(base_url):
     """Extracts all internal links, including footer and JavaScript-rendered links."""
     try:
@@ -58,7 +58,7 @@ def get_valid_links(base_url):
 
         return links
     except requests.exceptions.RequestException as e:
-        print(f"❌ Error fetching links: {e}")
+        print(f" Error fetching links: {e}")
         return set()
 
 def check_footer_links(soup):
@@ -91,10 +91,10 @@ def get_links_with_selenium(url):
 
         return links
     except Exception as e:
-        print(f"❌ Selenium Error: {e}")
+        print(f" Selenium Error: {e}")
         return set()
 
-# ✅ **Scrape Pages**
+# **Scrape Pages**
 def scrape_page(url):
     """Scrapes a webpage, using Selenium if necessary."""
     try:
@@ -125,21 +125,21 @@ def scrape_with_selenium(url):
 
         return extract_text(soup)
     except Exception as e:
-        return f"❌ Selenium Scraping Error: {e}"
+        return f" Selenium Scraping Error: {e}"
 
 def extract_text(soup):
     """Extracts **all** meaningful text from HTML content, including dynamic elements."""
 
-    # ✅ Extracts all text from the HTML, not just specific tags
+    # Extracts all text from the HTML, not just specific tags
     all_text = soup.get_text(separator="\n", strip=True)
 
-    # ✅ Removes duplicate lines & unwanted spaces
+    # Removes duplicate lines & unwanted spaces
     unique_lines = set(all_text.split("\n"))
     cleaned_text = "\n".join(line for line in unique_lines if len(line) > 3)  # Exclude tiny fragments
 
     return cleaned_text
 
-# ✅ **Chunking for Large AI Requests**
+# **Chunking for Large AI Requests**
 def split_into_chunks(text, chunk_size):
     """Splits long content into manageable chunks for AI processing."""
     words = text.split()
@@ -160,7 +160,7 @@ def split_into_chunks(text, chunk_size):
 
     return chunks
 
-# ✅ **AI-Powered Company Breakdown**
+# **AI-Powered Company Breakdown**
 def generate_detailed_company_info(company_data):
     """Generates an in-depth company breakdown with AI."""
 
@@ -220,7 +220,7 @@ def generate_detailed_company_info(company_data):
     response = chain.invoke({"text": user_prompt_template})
     return response.content
 
-# ✅ **Streamlit UI**
+# **Streamlit UI**
 def main():
     st.title("🚀 AI-Powered Company Website Scraper")
     base_url = st.text_input("🔗 Enter Website URL", "")
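
A side note on the extract_text shown in the diff: deduplicating with set() discards the original line order, so the cleaned text can come back shuffled. If document order matters for the downstream AI prompt, dict.fromkeys gives an order-preserving dedup. A minimal sketch of that variant (extract_text_ordered is a hypothetical name, not part of this commit):

from bs4 import BeautifulSoup

def extract_text_ordered(soup):
    """Like extract_text, but keeps lines in document order while deduplicating."""
    all_text = soup.get_text(separator="\n", strip=True)
    # dict.fromkeys() keeps the first occurrence of each line in insertion order,
    # unlike set(), which iterates in arbitrary order
    unique_lines = dict.fromkeys(all_text.split("\n"))
    return "\n".join(line for line in unique_lines if len(line) > 3)  # Exclude tiny fragments

# Usage:
soup = BeautifulSoup("<p>About us</p><p>Contact</p><p>About us</p>", "html.parser")
print(extract_text_ordered(soup))  # -> "About us\nContact"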
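
The diff shows only the first and last lines of split_into_chunks (words = text.split() and return chunks). For readers following along, a word-count-based chunker consistent with those fragments could look like the sketch below; this is an illustration under that assumption, not the file's actual body:

def split_into_chunks(text, chunk_size):
    """Splits long content into manageable chunks for AI processing."""
    words = text.split()
    chunks, current = [], []
    for word in words:
        current.append(word)
        if len(current) >= chunk_size:  # flush once the chunk reaches chunk_size words
            chunks.append(" ".join(current))
            current = []
    if current:  # keep the trailing partial chunk
        chunks.append(" ".join(current))
    return chunks

# Usage:
print(split_into_chunks("one two three four five", 2))
# -> ['one two', 'three four', 'five']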