marks committed on
Commit
4b172dd
·
1 Parent(s): a2f0554

Changed scrapers

Browse files
Files changed (2) hide show
  1. requirements.txt +2 -1
  2. scraper.py +7 -23
requirements.txt CHANGED
@@ -6,4 +6,5 @@ pydub==0.25.1 # audio processing library
6
  python-dotenv==1.0.0 # for environment variables
7
  requests==2.31.0 # for API calls
8
  numpy>1.24.3 # common dependency
9
- openrouter
 
 
6
  python-dotenv==1.0.0 # for environment variables
7
  requests==2.31.0 # for API calls
8
  numpy>1.24.3 # common dependency
9
+ openrouter
10
+ trafilatura>=1.6.1
scraper.py CHANGED
@@ -1,24 +1,8 @@
1
- def scrape_url(url):
2
- from browser_use import Browser
3
- from bs4 import BeautifulSoup
4
-
5
- # Initialize the browser
6
- browser = Browser()
7
-
8
- # Open the URL
9
- browser.open(url)
10
-
11
- # Get the page content
12
- content = browser.get_page_source()
13
 
14
- # Close the browser
15
- browser.close()
16
-
17
- # Parse the HTML content
18
- soup = BeautifulSoup(content, 'html.parser')
19
-
20
- # Extract relevant text (modify the selector as needed)
21
- text_elements = soup.find_all(['main'])
22
- text_content = ' '.join([element.get_text() for element in text_elements])
23
-
24
- return text_content.strip()
 
1
+ import trafilatura
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ def scrape_url(url):
4
+ downloaded = trafilatura.fetch_url(url)
5
+ if downloaded:
6
+ text = trafilatura.extract(downloaded, include_links=False, include_formatting=False)
7
+ return text if text else "No content could be extracted from the URL"
8
+ return "Failed to download the URL"