Commit 17250b8
Parent(s): 5f2db56

Better Scraping

Files changed:
- .gitattributes   +0 -35 (deleted)
- .gitignore       +0 -1
- README.md        +1 -0
- app.py           +13 -12
- requirements.txt +1 -1
.gitattributes DELETED
@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED
@@ -1,2 +1 @@
-__pychache__
 .env

README.md CHANGED
@@ -128,4 +128,5 @@ Special thanks to **Hugging Face** for hosting this space and **Groq AI** for th
 ### **π Ready to get company insights?**
 Run the scraper and generate **detailed company reports effortlessly**! π
 
+
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED
@@ -32,11 +32,12 @@ scraper = cloudscraper.create_scraper()
 # Headers to mimic real browser requests
 HEADERS = {
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
-    "Accept
-    "
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+    "Connection": "keep-alive",
+    "Upgrade-Insecure-Requests": "1",
 }
 
-#
+# β **Extract Links**
 def get_valid_links(base_url):
     """Extracts all internal links, including footer and JavaScript-rendered links."""
     try:
@@ -58,7 +59,7 @@ get_valid_links(base_url):
 
         return links
     except requests.exceptions.RequestException as e:
-        print(f" Error fetching links: {e}")
+        print(f"β Error fetching links: {e}")
         return set()
 
 def check_footer_links(soup):
@@ -91,10 +92,10 @@ get_links_with_selenium(url):
 
         return links
     except Exception as e:
-        print(f" Selenium Error: {e}")
+        print(f"β Selenium Error: {e}")
         return set()
 
-#
+# β **Scrape Pages**
 def scrape_page(url):
     """Scrapes a webpage, using Selenium if necessary."""
     try:
@@ -125,21 +126,21 @@ scrape_with_selenium(url):
 
         return extract_text(soup)
     except Exception as e:
-        return f" Selenium Scraping Error: {e}"
+        return f"β Selenium Scraping Error: {e}"
 
 def extract_text(soup):
     """Extracts **all** meaningful text from HTML content, including dynamic elements."""
 
-    #
+    # β Extracts all text from the HTML, not just specific tags
     all_text = soup.get_text(separator="\n", strip=True)
 
-    #
+    # β Removes duplicate lines & unwanted spaces
     unique_lines = set(all_text.split("\n"))
     cleaned_text = "\n".join(line for line in unique_lines if len(line) > 3)  # Exclude tiny fragments
 
     return cleaned_text
 
-#
+# β **Chunking for Large AI Requests**
 def split_into_chunks(text, chunk_size):
     """Splits long content into manageable chunks for AI processing."""
     words = text.split()
@@ -160,7 +161,7 @@ split_into_chunks(text, chunk_size):
 
     return chunks
 
-#
+# β **AI-Powered Company Breakdown**
 def generate_detailed_company_info(company_data):
     """Generates an in-depth company breakdown with AI."""
 
@@ -220,7 +221,7 @@ generate_detailed_company_info(company_data):
 
     response = chain.invoke({"text": user_prompt_template})
     return response.content
 
-#
+# β **Streamlit UI**
 def main():
     st.title("π AI-Powered Company Website Scraper")
     base_url = st.text_input("π Enter Website URL", "")
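
In short, the app.py changes add fuller browser-mimicking request headers, labelled section comments, and clearer error messages. As a rough sketch of the flow those headers feed into: the `fetch_page_text` helper, the ordered de-duplication, and the example URL below are illustrative and not part of the repository (the app's own `extract_text` uses a plain `set`, which does not preserve line order).

```python
# Illustrative only. A minimal end-to-end fetch that sends the new
# browser-mimicking headers through cloudscraper, then de-duplicates the
# page text while keeping the first occurrence of each line.
import cloudscraper
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

def fetch_page_text(url: str) -> str:
    """Fetch a page with browser-like headers and return cleaned, de-duplicated text."""
    scraper = cloudscraper.create_scraper()      # requests-compatible session
    response = scraper.get(url, headers=HEADERS, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    lines = soup.get_text(separator="\n", strip=True).split("\n")
    seen, kept = set(), []
    for line in lines:
        if len(line) > 3 and line not in seen:   # drop tiny fragments and repeats
            seen.add(line)
            kept.append(line)
    return "\n".join(kept)

if __name__ == "__main__":
    print(fetch_page_text("https://example.com")[:500])
```

Passing `headers=HEADERS` on each request is shown for clarity; since a cloudscraper scraper behaves like a `requests.Session`, the defaults could equally be set once with `scraper.headers.update(HEADERS)`.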

requirements.txt CHANGED
@@ -1,6 +1,6 @@
 urllib3==2.3.0
-requests==2.32.3
 selenium==4.21.0
+requests==2.32.3
 streamlit==1.41.1
 cloudscraper==1.2.71
 python-dotenv==1.0.1
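
requirements.txt only reorders the pins, so the resolved dependency set is unchanged. A small illustrative check (not part of the repository) that the installed versions match these pins, using only the standard library:

```python
# Illustrative only: compares installed package versions against the pins
# in requirements.txt and flags any mismatches.
from importlib.metadata import PackageNotFoundError, version

PINNED = {
    "urllib3": "2.3.0",
    "requests": "2.32.3",
    "selenium": "4.21.0",
    "streamlit": "1.41.1",
    "cloudscraper": "1.2.71",
    "python-dotenv": "1.0.1",
}

for name, expected in PINNED.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "not installed"
    status = "OK" if installed == expected else "MISMATCH"
    print(f"{name:15} pinned {expected:10} installed {installed:15} {status}")
```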