Spaces:

phxdev
/

podcaster

Running

marks committed on Jan 17

Commit

4b172dd

1 Parent(s): a2f0554

Changed scrapers

Files changed (2) hide show

requirements.txt CHANGED Viewed

@@ -6,4 +6,5 @@ pydub==0.25.1           # audio processing library
 python-dotenv==1.0.0    # for environment variables
 requests==2.31.0        # for API calls
 numpy>1.24.3           # common dependency
-openrouter

 python-dotenv==1.0.0    # for environment variables
 requests==2.31.0        # for API calls
 numpy>1.24.3           # common dependency
+openrouter
+trafilatura>=1.6.1

scraper.py CHANGED Viewed

@@ -1,24 +1,8 @@
-def scrape_url(url):
-    from browser_use import Browser
-    from bs4 import BeautifulSoup
-    # Initialize the browser
-    browser = Browser()
-    # Open the URL
-    browser.open(url)
-    # Get the page content
-    content = browser.get_page_source()
-    # Close the browser
-    browser.close()
-    # Parse the HTML content
-    soup = BeautifulSoup(content, 'html.parser')
-    # Extract relevant text (modify the selector as needed)
-    text_elements = soup.find_all(['main'])
-    text_content = ' '.join([element.get_text() for element in text_elements])
-    return text_content.strip()

+import trafilatura
+def scrape_url(url):
+    downloaded = trafilatura.fetch_url(url)
+    if downloaded:
+        text = trafilatura.extract(downloaded, include_links=False, include_formatting=False)
+        return text if text else "No content could be extracted from the URL"
+    return "Failed to download the URL"