kushagrasharma-13 committed
Commit 17250b8 · 1 Parent(s): 5f2db56

Better Scraping

Files changed (5)
  1. .gitattributes +0 -35
  2. .gitignore +0 -1
  3. README.md +1 -0
  4. app.py +13 -12
  5. requirements.txt +1 -1
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,2 +1 @@
- __pychache__
  .env
README.md CHANGED
@@ -128,4 +128,5 @@ Special thanks to **Hugging Face** for hosting this space and **Groq AI** for th
  ### **🚀 Ready to get company insights?**
  Run the scraper and generate **detailed company reports effortlessly**! 🔍

+
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -32,11 +32,12 @@ scraper = cloudscraper.create_scraper()
  # Headers to mimic real browser requests
  HEADERS = {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
-     "Accept-Language": "en-US,en;q=0.9",
-     "Referer": "https://www.google.com/"
+     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+     "Connection": "keep-alive",
+     "Upgrade-Insecure-Requests": "1",
  }

- # **Extract Links**
+ # ✅ **Extract Links**
  def get_valid_links(base_url):
      """Extracts all internal links, including footer and JavaScript-rendered links."""
      try:
@@ -58,7 +59,7 @@ def get_valid_links(base_url):

          return links
      except requests.exceptions.RequestException as e:
-         print(f" Error fetching links: {e}")
+         print(f"❌ Error fetching links: {e}")
          return set()

  def check_footer_links(soup):
@@ -91,10 +92,10 @@ def get_links_with_selenium(url):

          return links
      except Exception as e:
-         print(f" Selenium Error: {e}")
+         print(f"❌ Selenium Error: {e}")
          return set()

- # **Scrape Pages**
+ # ✅ **Scrape Pages**
  def scrape_page(url):
      """Scrapes a webpage, using Selenium if necessary."""
      try:
@@ -125,21 +126,21 @@ def scrape_with_selenium(url):

          return extract_text(soup)
      except Exception as e:
-         return f" Selenium Scraping Error: {e}"
+         return f"❌ Selenium Scraping Error: {e}"

  def extract_text(soup):
      """Extracts **all** meaningful text from HTML content, including dynamic elements."""

-     # Extracts all text from the HTML, not just specific tags
+     # ✅ Extracts all text from the HTML, not just specific tags
      all_text = soup.get_text(separator="\n", strip=True)

-     # Removes duplicate lines & unwanted spaces
+     # ✅ Removes duplicate lines & unwanted spaces
      unique_lines = set(all_text.split("\n"))
      cleaned_text = "\n".join(line for line in unique_lines if len(line) > 3)  # Exclude tiny fragments

      return cleaned_text

- # **Chunking for Large AI Requests**
+ # ✅ **Chunking for Large AI Requests**
  def split_into_chunks(text, chunk_size):
      """Splits long content into manageable chunks for AI processing."""
      words = text.split()
@@ -160,7 +161,7 @@ def split_into_chunks(text, chunk_size):

      return chunks

- # **AI-Powered Company Breakdown**
+ # ✅ **AI-Powered Company Breakdown**
  def generate_detailed_company_info(company_data):
      """Generates an in-depth company breakdown with AI."""

@@ -220,7 +221,7 @@ def generate_detailed_company_info(company_data):
      response = chain.invoke({"text": user_prompt_template})
      return response.content

- # **Streamlit UI**
+ # ✅ **Streamlit UI**
  def main():
      st.title("🚀 AI-Powered Company Website Scraper")
      base_url = st.text_input("🔗 Enter Website URL", "")
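
Note on the header change above: the commit replaces the Accept-Language/Referer pair with a fuller browser-style header set. The following is a rough, self-contained sketch of how the updated HEADERS and the cloudscraper session are intended to work together, not part of this commit: example.com is a placeholder URL, fetch_clean_text is a hypothetical helper, and beautifulsoup4 is assumed to be installed alongside cloudscraper.

    import cloudscraper
    from bs4 import BeautifulSoup

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    def fetch_clean_text(url):
        """Hypothetical helper: fetch a page with browser-like headers and return de-duplicated text."""
        scraper = cloudscraper.create_scraper()  # behaves like a requests.Session
        response = scraper.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        all_text = soup.get_text(separator="\n", strip=True)
        unique_lines = set(all_text.split("\n"))  # drop duplicate lines, mirroring extract_text above
        return "\n".join(line for line in unique_lines if len(line) > 3)

    if __name__ == "__main__":
        print(fetch_clean_text("https://example.com")[:500])  # placeholder URL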
requirements.txt CHANGED
@@ -1,6 +1,6 @@
  urllib3==2.3.0
- requests==2.32.3
  selenium==4.21.0
+ requests==2.32.3
  streamlit==1.41.1
  cloudscraper==1.2.71
  python-dotenv==1.0.1