Update scraper.py

scraper.py  (+7 -7)
@@ -6,9 +6,6 @@ from bs4 import BeautifulSoup
 import requests
 import time
 
-
-
-
 class Scraper:
     @staticmethod
     async def power_scrapper_2(url):
@@ -32,7 +29,10 @@ class Scraper:
         # Get page content (text from paragraphs and headers)
         page_content = await page.evaluate("""() => {
             let elements = Array.from(document.querySelectorAll('body *'));
-            return elements
+            return elements
+                .filter(element => element.tagName.match(/^(P|H1|H2|H3|H4|H5|H6|LI|DIV|SPAN)$/i) && element.innerText.trim().length > 0)
+                .map(element => element.innerText.trim())
+                .join('\\n');
         }""")
 
 
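This hunk completes the bare return elements into a filter/map/join chain that keeps only paragraph, heading, list item, div, and span elements with non-empty text. Below is a minimal, self-contained sketch of how that evaluate() snippet could be driven. It assumes Playwright's async API (the diff shows only the evaluate() call, so the real project may use a different headless-browser library), and extract_text is a hypothetical wrapper, not part of this commit.

import asyncio
from playwright.async_api import async_playwright

async def extract_text(url):
    # Hypothetical driver for the evaluate() snippet above; assumes Playwright.
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url)
        # Same logic as the patched snippet: collect visible text from
        # paragraph/heading/list/div/span elements and join it with newlines.
        page_content = await page.evaluate("""() => {
            let elements = Array.from(document.querySelectorAll('body *'));
            return elements
                .filter(element => element.tagName.match(/^(P|H1|H2|H3|H4|H5|H6|LI|DIV|SPAN)$/i) && element.innerText.trim().length > 0)
                .map(element => element.innerText.trim())
                .join('\\n');
        }""")
        await browser.close()
        return page_content

# asyncio.run(extract_text("https://example.com"))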
@@ -99,7 +99,7 @@ class Scraper:
     async def scrape(url):
         try:
             headers = {'User-Agent': 'Mozilla/5.0'}
-            response = requests.get(url,timeout=
+            response = requests.get(url,timeout=3)
             soup = BeautifulSoup(response.content, 'html.parser')
 
             title = Scraper.get_title(soup)
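The removed line ends in an incomplete timeout= argument (it appears truncated in the original); the new code passes timeout=3, so requests.get raises instead of hanging on a slow host and the except fallback can take over. A minimal sketch of that fetch-and-parse step follows, with the hypothetical helper name fetch_soup; it also passes the headers dict, which the diff defines but does not visibly use.

import requests
from bs4 import BeautifulSoup

def fetch_soup(url):
    # Hypothetical helper mirroring the try-branch above. With timeout=3,
    # requests.get raises requests.exceptions.Timeout after ~3 seconds
    # instead of blocking indefinitely.
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=3)
    return BeautifulSoup(response.content, 'html.parser')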
@@ -109,8 +109,8 @@ class Scraper:
             if not links:
                 print("Running alternative scrapper")
                 links, text_content = await Scraper.power_scrapper_2(url)
+
             return {"title": title, "URL": links, "Content": text_content}
         except:
-            print("Running alternative scrapper second time")
             title,links, text_content = await Scraper.power_scrapper_2(url)
-            return {"title": title, "URL": links, "Content": text_content}
+            return {"title": title, "URL": links, "Content": text_content}
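The last hunk drops the second "Running alternative scrapper" print and adds the missing return in the except branch, so a failed requests fetch now falls back to the headless scraper and still produces a result. A condensed sketch of the resulting control flow is below; get_links and get_text are assumed helper names (only get_title and the links/text_content variables appear in the diff), and the bare except plus the two different unpacking shapes of power_scrapper_2 are copied from the diff as-is.

async def scrape(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, timeout=3)
        soup = BeautifulSoup(response.content, 'html.parser')
        title = Scraper.get_title(soup)
        links = Scraper.get_links(soup)          # assumed helper
        text_content = Scraper.get_text(soup)    # assumed helper
        if not links:
            print("Running alternative scrapper")
            links, text_content = await Scraper.power_scrapper_2(url)
        return {"title": title, "URL": links, "Content": text_content}
    except:
        # Fallback path: rerun the headless scraper for everything, including
        # the title. Note it unpacks three values here but two values above.
        title, links, text_content = await Scraper.power_scrapper_2(url)
        return {"title": title, "URL": links, "Content": text_content}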